In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

### Store CSV into DataFrame

In [2]:
# Relative path to the wine-review CSV that feeds the rest of the notebook.
csv_file = "Resources/winemag-data_first150k_for_project.csv"

# Parse the whole file into a DataFrame, then preview its first five rows.
winemag_data_df = pd.read_csv(filepath_or_buffer=csv_file)
winemag_data_df.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,34920,France,"A big, powerful wine that sums up the richness...",,99,2300.0,Bordeaux,Pauillac,,Bordeaux-style Red Blend,Château Latour
1,13318,US,The nose on this single-vineyard wine from a s...,Roger Rose Vineyard,91,2013.0,California,Arroyo Seco,Central Coast,Chardonnay,Blair
2,34922,France,"A massive wine for Margaux, packed with tannin...",,98,1900.0,Bordeaux,Margaux,,Bordeaux-style Red Blend,Château Margaux
3,26296,France,A wine that has created its own universe. It h...,Clos du Mesnil,100,1400.0,Champagne,Champagne,,Chardonnay,Krug
4,51886,France,A wine that has created its own universe. It h...,Clos du Mesnil,100,1400.0,Champagne,Champagne,,Chardonnay,Krug


### Create a new DataFrame with selected columns

In [3]:
# Keep seven of the loaded columns (the unnamed, designation and region columns
# are left behind). .copy() makes the result independent of winemag_data_df.
new_winemag_data_df = winemag_data_df.loc[
    :,
    [
        'country',
        'description',
        'points',
        'price',
        'province',
        'variety',
        'winery',
    ],
].copy()
new_winemag_data_df.head(5)

Unnamed: 0,country,description,points,price,province,variety,winery
0,France,"A big, powerful wine that sums up the richness...",99,2300.0,Bordeaux,Bordeaux-style Red Blend,Château Latour
1,US,The nose on this single-vineyard wine from a s...,91,2013.0,California,Chardonnay,Blair
2,France,"A massive wine for Margaux, packed with tannin...",98,1900.0,Bordeaux,Bordeaux-style Red Blend,Château Margaux
3,France,A wine that has created its own universe. It h...,100,1400.0,Champagne,Chardonnay,Krug
4,France,A wine that has created its own universe. It h...,100,1400.0,Champagne,Chardonnay,Krug


### Basic Cleaning

In [7]:
# Count the non-null entries in every column; a column whose total is lower
# than the others contains missing values.
new_winemag_data_df.notna().sum()

country        150925
description    150930
points         150930
price          137235
province       150925
variety        150930
winery         150930
dtype: int64

In [8]:
# Keep only fully populated rows: a row is discarded as soon as any one of its
# columns is missing.
# NOTE(review): rows 3 and 4 of the head() preview above appear identical in
# every selected column (descriptions are truncated there) - confirm whether
# duplicate reviews should also be removed.
new_winemag_data_df = new_winemag_data_df.loc[
    new_winemag_data_df.notna().all(axis='columns')
]

In [9]:
# After dropping incomplete rows, every column should report the same number
# of non-null values.
new_winemag_data_df.count(axis='index')

country        137230
description    137230
points         137230
price          137230
province       137230
variety        137230
winery         137230
dtype: int64

In [10]:
# Check that the dtype pandas inferred for each column fits the data. In the
# recorded output the text columns are object, points is int64 and price is
# float64.
new_winemag_data_df.dtypes

country         object
description     object
points           int64
price          float64
province        object
variety         object
winery          object
dtype: object

In [13]:
# Tally how many reviews each grape variety has, most frequent first.
new_winemag_data_df.loc[:, 'variety'].value_counts()

Chardonnay               13775
Pinot Noir               13625
Cabernet Sauvignon       12671
Red Blend                 9377
Sauvignon Blanc           6054
                         ...  
Baga-Touriga Nacional        1
Rebula                       1
Albarín                      1
Syrah-Carignan               1
Premsal                      1
Name: variety, Length: 619, dtype: int64

In [None]:
# TODO: Isolate the top 5 or 6 wine types: 'Chardonnay', 'Pinot Noir', 'Cabernet Sauvignon', 'Red Blend', 'Sauvignon Blanc'

### Connect to local database

In [6]:
# Build the PostgreSQL connection string from environment variables so real
# credentials never have to be written into the notebook. The variable names
# follow the libpq convention (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE).
# The fallbacks reproduce the original local-development settings, so the cell
# behaves exactly as before when none of the variables are set.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD", "postgres")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "winemag_db")

# quote_plus escapes characters such as '@', ':' or '/' in the credentials
# that would otherwise be misread as URL delimiters.
rds_connection_string = (
    f"{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)

# The plain postgresql:// scheme uses the psycopg2 driver, which must be
# installed in the kernel's environment; the ModuleNotFoundError recorded
# below this cell shows it was missing when the notebook was last run.
engine = create_engine(f'postgresql://{rds_connection_string}')

ModuleNotFoundError: No module named 'psycopg2'

### Check for tables

In [None]:
# List the tables that currently exist in the connected database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of names and works on both.
inspect(engine).get_table_names()

### Use pandas to load the CSV-derived DataFrame into the database

In [None]:
# Write the cleaned rows into the 'winemag' table. The DataFrame index is not
# stored as a column. Because if_exists='append' adds to whatever the table
# already holds, running this cell again inserts the same rows a second time.
new_winemag_data_df.to_sql(
    name='winemag',
    con=engine,
    index=False,
    if_exists='append',
)