In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import classification_report


In [None]:
# Load the labelled wine dataset. The first CSV column is read as the index and
# then turned back into an ordinary column by reset_index(), which leaves a
# fresh positional index. Rows whose 'description' is null are dropped.
wine_base = (
    pd.read_csv("Our_dataset/StemmedWord2vecTop3_parsed_weather_labeled.csv", index_col=0)
    .reset_index()
    .loc[lambda frame: pd.notnull(frame['description'])]
)
wine_base

In [None]:
# Distribution of the 'points' score: 20 histogram bars with black edges,
# drawn on an explicitly created 12x5 figure.
# NOTE(review): distplot is marked deprecated in recent seaborn releases;
# consider histplot/displot once the installed version is confirmed.
fig, ax = plt.subplots(figsize=(12, 5))
sns.distplot(
    wine_base["points"],
    hist=True,
    bins=20,
    hist_kws={'edgecolor': 'black'},
    ax=ax,  # same axes the call would pick up implicitly as the current axes
)

# DEFINE NUMBER OF BINS

In [None]:
# Build the classification target by discretising the numeric 'points' score.
# Passing an integer to pd.cut splits the observed score range into that many
# equal-width intervals; `labels` names them from the lowest to the highest.
#
# Alternative 5-class setup:
#   bins = 5
#   labels = ["very_low", "low", "medium", "high", "very_high"]
bins = 3
labels = ["low", "medium", "high"]
Y = pd.cut(wine_base['points'].copy(), bins, labels=labels)

In [None]:
# Display the binned target to eyeball the class assigned to each row.
Y

In [None]:
# Column groups available in the dataset. Each list names the columns of one
# feature family so that feature sets can be composed by concatenating groups.

# Descriptive attributes of the wine itself.
basic = [
    'vintage',
    'country',
    'price',
    'province',
    'region_1',
    'taster_name',
    'variety',
    'winery',
]

# Single column holding the description word count.
word = ["word_count"]

# Top-3 term-frequency / tf-idf columns (grouped and full-data variants).
tfGroup = ['tf_grouped_1', 'tf_grouped_2', 'tf_grouped_3']
tfIdfGroup = ['tfIdf_grouped_1', 'tfIdf_grouped_2', 'tfIdf_grouped_3']
tfFull = ['tf_fullData_1', 'tf_fullData_2', 'tf_fullData_3']
tfIdfFull = ['tfIdf_fullData_1', 'tfIdf_fullData_2', 'tfIdf_fullData_3']

# Weather columns: 'pr_*' and 'tas_*' series with suffixes 5 through 9.
weather = [
    'pr_5', 'pr_6', 'pr_7', 'pr_8', 'pr_9',
    'tas_5', 'tas_6', 'tas_7', 'tas_8', 'tas_9',
]

# Word2vec-based similarity column.
word2vec = ['similarityTop3WinesByVariety']

In [None]:
# Feature set used by the models below: basic attributes, word count, the
# word2vec similarity, grouped term frequencies and weather. The tf-idf and
# full-data tf groups are not part of this selection.
features = [*basic, *word, *word2vec, *tfGroup, *weather]

# Select exactly those columns, in that order, as the design matrix.
X = wine_base.loc[:, features]
X.columns

# REAL CLASSIFIER

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs (and identical to the baseline's split).
test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

# Seed the tree as well: features are randomly permuted at each split, so an
# unseeded tree can give different scores and importances on every run even
# with the same training data.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy and support-weighted precision on the held-out rows.
acc = accuracy_score(y_test, y_pred)
weightedPrec = precision_score(y_test, y_pred, average="weighted")

# Pair every feature with its importance as [name, importance]. Names come
# from the columns the model was actually fitted on, so they cannot drift
# from the importances if `features` is edited without rebuilding X.
lista = [
    [name, importance]
    for name, importance in zip(X_train.columns, classifier.feature_importances_)
]

In [None]:
# Per-class precision/recall/F1 on the held-out rows.
# `labels=labels` is required: without it the report rows follow the sorted
# class values (alphabetical for these string classes) while `target_names`
# is applied positionally, so every row would carry the wrong class name.
print(classification_report(y_test, y_pred, labels=labels, target_names=labels))

**TODO:** check AUC, try feature selection, and test the data as a sparse matrix.

# Append this run's configuration and scores to a per-bin-count log file.
# The `with` block closes the handle even if one of the writes raises.
with open("classification" + str(bins) + "bins.txt", "a") as file:
    file.write("Decision Tree Classifier with points divided into " + str(bins) + " bins \n")
    file.write("Built on: " + str(features) + "\n")
    file.write("Test size: " + str(test_size) + "\n")
    file.write("   " + "accuracy " + str(acc) + "\n")
    file.write("   " + "weightedPrec " + str(weightedPrec) + "\n")
    file.write("Feature importances: \n")
    # `lista` holds [feature name, importance] pairs.
    for name, importance in lista:
        file.write("   " + name + ":   " + str(importance) + "\n")
    file.write("Report By predicted class: \n")
    # `labels=labels` pins the row order to match `target_names`; otherwise
    # the rows follow the sorted class values and the names are misassigned.
    file.write(classification_report(y_test, y_pred, labels=labels, target_names=labels))

# DUMMY CLASSIFIER = BASELINE

In [None]:
# Baseline: same 30% hold-out split (same seed) as the decision tree, so the
# two sets of scores are computed on identical rows.
test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

# `strategy` must be passed by keyword (it is keyword-only in current
# scikit-learn). The stratified strategy draws random predictions following
# the training class distribution, so it is seeded for reproducibility.
classifier = DummyClassifier(strategy="stratified", random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Overall accuracy and support-weighted precision on the held-out rows.
acc = accuracy_score(y_test, y_pred)
weightedPrec = precision_score(y_test, y_pred, average="weighted")

# The dummy model exposes no feature importances; reset the list so stale
# values from the decision tree are not carried over.
lista = []

# Append the baseline's configuration and scores to its own log file.
# The `with` block closes the handle even if one of the writes raises.
with open("dummyClassifier.txt", "a") as file:
    # Header names the model that actually produced these scores (it used to
    # say "Decision Tree Classifier", copied from the tree's report).
    file.write("Dummy Classifier (baseline) with points divided into " + str(bins) + " bins \n")
    file.write("Built on: " + str(features) + "\n")
    file.write("Test size: " + str(test_size) + "\n")
    file.write("   " + "accuracy " + str(acc) + "\n")
    file.write("   " + "weightedPrec " + str(weightedPrec) + "\n")
    # No feature-importance section: the dummy model does not provide any.
    file.write("Report By predicted class: \n")
    # `labels=labels` pins the row order to match `target_names`; otherwise
    # the rows follow the sorted class values and the names are misassigned.
    file.write(classification_report(y_test, y_pred, labels=labels, target_names=labels))