In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
url = "winemag-data_first150k.csv"
df = pd.read_csv(url)
print(df.shape)
print(df.columns.tolist()[:12])
df.head(3)

(150930, 11)
['Unnamed: 0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'variety', 'winery']


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley


In [None]:
missing = df.isnull().sum().sort_values(ascending=False)
missing.head(10)

region_2       89977
designation    45735
region_1       25060
price          13695
province           5
country            5
Unnamed: 0         0
description        0
points             0
variety            0
dtype: int64

In [None]:
df_clean = df.copy()
num_cols = df_clean.select_dtypes(include=np.number).columns
cat_cols = df_clean.select_dtypes(include='object').columns
for c in num_cols:
    df_clean[c] = df_clean[c].fillna(df_clean[c].median())
for c in cat_cols:
    if df_clean[c].isnull().any():
        df_clean[c] = df_clean[c].fillna(df_clean[c].mode()[0])
df_clean.isnull().sum().head(10)

Unnamed: 0     0
country        0
description    0
designation    0
points         0
price          0
province       0
region_1       0
region_2       0
variety        0
dtype: int64

In [None]:
# feature engineering steps = price_per_point  and is_expenesive
df_fe = df_clean.copy()
df_fe['price_per_point'] = df_fe.apply(lambda r: r['price']/r['points'] if r.get('points', np.nan) not in [0, np.nan] else np.nan, axis=1)
df_fe['price_per_point'] = df_fe['price_per_point'].fillna(df_fe['price_per_point'].median())
q75 = df_fe['price_per_point'].quantile(0.75)
df_fe['is_expensive'] = (df_fe['price'] > q75).astype(int)
df_fe[['price', 'points', 'price_per_point', 'is_expensive']].head(10)

Unnamed: 0,price,points,price_per_point,is_expensive
0,235.0,96,2.447917,1
1,110.0,96,1.145833,1
2,90.0,96,0.9375,1
3,65.0,96,0.677083,1
4,66.0,95,0.694737,1
5,73.0,95,0.768421,1
6,65.0,95,0.684211,1
7,110.0,95,1.157895,1
8,65.0,95,0.684211,1
9,60.0,95,0.631579,1


In [None]:
# One-hot encoding 
feature_cols_num = ['points','price','price_per_point']
feature_cols_cat = ['country','province','variety','winery']

x_base = df_fe[feature_cols_num + feature_cols_cat].copy()
y = df_fe['is_expensive']


def top_n_categories(s, n=20):
    top = s.value_counts().nlargest(n).index
    return s.where(s.isin(top), other='__OTHER__')

for c in feature_cols_cat:
    x_base[c] = top_n_categories(x_base[c], n=20)

x = pd.get_dummies(x_base, columns=feature_cols_cat, drop_first=True)
x.head(3)

Unnamed: 0,points,price,price_per_point,country_Australia,country_Austria,country_Canada,country_Chile,country_Croatia,country_France,country_Germany,...,winery_Joseph Drouhin,winery_Kendall-Jackson,winery_Kenwood,winery_Louis Latour,winery_Robert Mondavi,winery_Testarossa,winery_Trapiche,winery_Williams Selyem,winery_Wines & Winemakers,winery___OTHER__
0,96,235.0,2.447917,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,96,110.0,1.145833,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,96,90.0,0.9375,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [None]:
scaler = StandardScaler()
x_num = pd.DataFrame(scaler.fit_transform(x[feature_cols_num]), columns=feature_cols_num, index=x.index)
x_sacled = x.copy()
x_sacled[feature_cols_num] = x_num
x_sacled.head(3)


Unnamed: 0,points,price,price_per_point,country_Australia,country_Austria,country_Canada,country_Chile,country_Croatia,country_France,country_Germany,...,winery_Joseph Drouhin,winery_Kendall-Jackson,winery_Kenwood,winery_Louis Latour,winery_Robert Mondavi,winery_Testarossa,winery_Trapiche,winery_Williams Selyem,winery_Wines & Winemakers,winery___OTHER__
0,2.517263,5.835613,5.662144,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,2.517263,2.236885,2.127696,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,2.517263,1.661089,1.562185,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [None]:
# build model
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00     30186

    accuracy                           1.00     30186
   macro avg       1.00      1.00      1.00     30186
weighted avg       1.00      1.00      1.00     30186

