In [187]:
# importeer nodige libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

__INTERNE RED WINES DATASET__

In [188]:
# Read the internal red-wine dataset (semicolon-separated, Latin-1 encoded).
redwines_raw = pd.read_csv("redwine.csv", delimiter=";", encoding='iso-8859-1')

# Vintage year as written in the wine title: 1990-1999 or 2000-2029.
# The earlier pattern used the character class `[0,1,2]`, which also matches a
# literal comma; a title containing e.g. "20,5" would then have been extracted
# as a "year" and crashed the integer conversion below.
VINTAGE_PATTERN = r"(199\d|20[0-2]\d)"

# Build the cleaned frame in one chain from the raw frame, so re-running this
# cell always gives the same result and no filtered view is assigned into.
redwines = (
    redwines_raw
    # columns that are not needed for the analysis
    .drop(['country', 'variety', 'id'], axis=1)
    # expand=False makes str.extract return a Series (one capture group)
    .assign(year=lambda frame: frame['title'].str.extract(VINTAGE_PATTERN, expand=False))
    # drop wines whose title has no recognisable year
    # (original author's note: 2465 -> 2461 rows)
    .dropna(subset=['year'])
    .astype({'year': int})
    .set_index('year')
    # drop every remaining row that contains a missing value
    .dropna()
)
redwines.head(10)

Unnamed: 0_level_0,description,designation,points,price,province,taster_name,title,winery,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Kerin OKeefe,Quinta dos Avidagos 2011 Avidagos Red (Douro),Quinta dos Avidagos,10.0,0.29,0.4,2.9,0.098,10.0,26.0,10.006,3.48,0.91,9.7
2013,"From an estate in the south of the Alentejo, t...",Grande Reserva Tinto,91,26.0,Alentejano,Roger Voss,Casa Santa Vitória 2013 Grande Reserva Tinto R...,Casa Santa Vitória,8.3,0.26,0.37,1.4,0.076,8.0,23.0,0.9974,3.26,0.7,9.6
2012,A year in wood and 30 months in bottle before ...,Montes Claros Garrafeira,90,28.0,Alentejo,Paul Gregutt,Adega Cooperativa de Borba 2012 Montes Claros ...,Adega Cooperativa de Borba,7.0,0.69,0.07,2.5,0.091,15.0,21.0,0.99572,3.38,0.6,11.3
2010,"A powerful wine, richly structured and full of...",Gerações Colheita Seleccionada,92,34.0,Alentejano,Paul Gregutt,Herdade Grande 2010 Gerações Colheita Seleccio...,Herdade Grande,8.0,0.48,0.34,2.2,0.073,16.0,25.0,0.9936,3.28,0.66,12.4
2013,This is a new wine from the hands of master wi...,Monte de Carrapatelo Colheita Seleccionada Tinto,92,30.0,Alentejano,Michael Schachner,Luis Duarte 2013 Monte de Carrapatelo Colheita...,Luis Duarte,8.2,0.23,0.42,1.9,0.069,9.0,17.0,0.99376,3.21,0.54,12.3
2013,As its name suggests this is a selection of ba...,Lagoalva Barrel Selection Tinto,92,23.0,Tejo,Kerin OKeefe,Quinta da Lagoalva de Cima 2013 Lagoalva Barre...,Quinta da Lagoalva de Cima,10.8,0.47,0.43,2.1,0.171,27.0,66.0,0.9982,3.17,0.76,10.8
2005,"Powerful and concentrated, this is just at its...",Grande Reserva,92,32.0,Alentejo,Anna Lee C. Iijima,Monte da Penha 2005 Grande Reserva Red (Alentejo),Monte da Penha,7.9,0.33,0.41,1.5,0.056,6.0,35.0,0.99396,3.29,0.71,11.0
2008,"This mature wine has a soft, smooth character ...",VT '08,90,32.0,Douro,Roger Voss,Quinta do Sagrado 2008 VT '08 Red (Douro),Quinta do Sagrado,7.0,0.69,0.07,2.5,0.091,15.0,21.0,0.99572,3.38,0.6,11.3
2012,"A ripe and structured wine, this has bold blac...",Aneto Tinto,90,12.0,Douro,Virginie Boone,Sobredos 2012 Aneto Tinto Red (Douro),Sobredos,8.8,0.27,0.46,2.1,0.095,20.0,29.0,0.99488,3.26,0.56,11.3
2014,Just lightly touched by wood this is a bright ...,Evel Tinto,87,13.0,Douro,Virginie Boone,Real Companhia Velha 2014 Evel Tinto Red (Douro),Real Companhia Velha,9.3,0.27,0.41,2.0,0.091,6.0,16.0,0.998,3.28,0.7,9.7


__EXTERNE WEER DATASET__

In [189]:
# Read the external Lisbon weather dataset (semicolon-separated, Latin-1 encoded).
weather_raw = pd.read_csv("weatherdata_lisbon.csv", delimiter=";", encoding='iso-8859-1')


def _decimal_comma_to_float(column, strip_flag=False):
    """Parse a string column that uses a decimal comma into floats.

    When ``strip_flag`` is true, trailing ``*`` characters are removed before
    the conversion.
    """
    text = column.str.replace(',', '.')
    if strip_flag:
        text = text.str.rstrip("*")
    return text.astype(float)


def _fahrenheit_to_celsius(fahrenheit):
    """Convert degrees Fahrenheit to degrees Celsius."""
    return (fahrenheit - 32) / 1.8


# Drop unusable columns, drop rows with missing values and give the remaining
# columns readable names.
weather_conditions = (
    weather_raw
    .drop(['STN---', 'Unnamed: 5', 'Unnamed: 7', 'GUST', 'PRCP', 'SNDP', 'VISIB'], axis=1)
    .dropna()
    .rename(columns={
        'YEARMODA': 'Year',
        'DEWP': 'Dewpoint Temperature',
        'TEMP': 'Temperature',
        'MAX': 'Max Temperature',
        'MIN': 'Min Temperature',
        'WDSP': 'Windspeed',
        'MXSPD': 'Max Windspeed',
    })
)

# Clean up and convert data types; temperatures go from Fahrenheit to Celsius.
# Max/min temperatures may carry a trailing "*" that has to be stripped first.
for name in ('Max Temperature', 'Min Temperature'):
    weather_conditions[name] = _fahrenheit_to_celsius(
        _decimal_comma_to_float(weather_conditions[name], strip_flag=True)
    )
for name in ('Temperature', 'Dewpoint Temperature'):
    weather_conditions[name] = _fahrenheit_to_celsius(
        _decimal_comma_to_float(weather_conditions[name])
    )
for name in ('Windspeed', 'Max Windspeed'):
    weather_conditions[name] = _decimal_comma_to_float(weather_conditions[name])
weather_conditions['Year'] = pd.to_datetime(weather_conditions['Year'].astype(str), format='%Y-%m-%d')

# FRSHTT is a six-character flag field; left-pad with zeros to six characters
# and split it into one 0/1 column per position.
# NOTE(review): in the NOAA GSOD format the first flag stands for Fog, not
# Frost. The column name 'Frost' is kept so downstream feature names do not
# change - confirm against the data source and rename if appropriate.
FLAG_COLUMNS = ['Frost', 'Rain', 'Snow', 'Hail', 'Thunder', 'Tornado']
flags = weather_conditions['FRSHTT'].map('{0:0>6}'.format)
for position, name in enumerate(FLAG_COLUMNS):
    weather_conditions[name] = flags.str[position].astype(float)
weather_conditions = weather_conditions.drop('FRSHTT', axis=1)

# Remove outliers: keep only rows whose value is below the limit for every
# listed column.
OUTLIER_LIMITS = {
    'Min Temperature': 50,
    'Max Temperature': 50,
    'Temperature': 50,
    'Windspeed': 50,
    'Max Windspeed': 50,
    'Dewpoint Temperature': 100,
}
for name, limit in OUTLIER_LIMITS.items():
    weather_conditions = weather_conditions[weather_conditions[name] < limit]

# Average every weather property per calendar year.
# Taking the mean of the selected numeric columns replaces the earlier manual
# sum / count division: summing the whole frame (including the datetime 'Year'
# column) fails on recent pandas versions, and pd.Int64Index no longer exists
# in pandas 2.x.
WEATHER_FEATURES = ['Temperature', 'Dewpoint Temperature', 'Windspeed', 'Max Windspeed',
                    'Max Temperature', 'Min Temperature'] + FLAG_COLUMNS
average_weather_conditions_peryear = (
    weather_conditions
    .groupby(weather_conditions['Year'].dt.year)[WEATHER_FEATURES]
    .mean()
    # 2018 has only a single measurement (original author's note); ignore the
    # drop when that year is not present at all
    .drop(2018, errors='ignore')
)

average_weather_conditions_peryear

Unnamed: 0_level_0,Temperature,Dewpoint Temperature,Windspeed,Max Windspeed,Max Temperature,Min Temperature,Frost,Rain,Snow,Hail,Thunder,Tornado
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1990,17.434877,12.111111,6.596375,13.130816,22.187311,13.63142,0.054381,0.338369,0.0,0.0,0.066465,0.0
1991,17.642094,10.155449,7.080128,14.254487,22.685897,13.711538,0.086538,0.282051,0.0,0.0,0.028846,0.0
1992,16.661886,10.002308,7.333828,14.245401,21.635015,12.795252,0.094955,0.267062,0.002967,0.002967,0.020772,0.0
1993,16.396541,11.520661,6.842975,12.932507,20.738292,12.757576,0.096419,0.347107,0.002755,0.002755,0.030303,0.00551
1994,16.997398,12.179063,7.210468,13.538292,21.46832,13.451791,0.110193,0.297521,0.00551,0.0,0.041322,0.0
1995,18.023569,11.901286,6.990634,13.589256,22.515152,14.4573,0.07438,0.30854,0.0,0.0,0.035813,0.002755
1996,17.137675,12.306163,7.124863,13.982514,21.494536,13.598361,0.054645,0.385246,0.002732,0.0,0.057377,0.0
1997,18.222375,13.332265,7.43544,13.345055,22.56044,14.725275,0.14011,0.379121,0.008242,0.008242,0.06044,0.0
1998,17.598954,12.297014,7.892244,13.558449,22.274238,13.880886,0.146814,0.274238,0.00277,0.0,0.027701,0.0
1999,17.39426,12.65178,8.221271,13.896409,21.651934,13.889503,0.118785,0.309392,0.002762,0.0,0.027624,0.0


__MERGING EXTERNE EN INTERNE DATASETS__

In [190]:
# Keep only the review score and the province of each wine; the vintage year
# is carried by the index.
redwines_year_comparison = redwines.loc[:, ['points', 'province']]
# Optional experiment: restrict the comparison to a single province
# (e.g. only rows where province == 'Tejo') before merging.

# Attach the yearly weather averages to every wine by matching the two year
# indexes (inner join: wines from years without weather data are left out).
redwines_with_yeartemperatures = pd.merge(
    redwines_year_comparison,
    average_weather_conditions_peryear,
    how='inner',
    left_index=True,
    right_index=True,
)
# To inspect the number of wines per province after the merge, look at the
# value counts of the 'province' column.

__CREATING THE MODEL__

In [191]:
# Available feature columns:
# ['Temperature', 'Dewpoint Temperature','Windspeed','Max Windspeed','Max Temperature','Min Temperature','Frost','Rain','Snow','Hail','Thunder','Tornado']

# Dependent variable: the wine's points. Independent variables: every yearly
# weather average that was merged in.
dependent = redwines_with_yeartemperatures['points']
independent = redwines_with_yeartemperatures.drop(['points', 'province'], axis=1)

# Train / test split (80% / 20%) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, random_state=0, test_size=0.2)

# Set up and train the model. LogisticRegression is a classifier, so every
# distinct points value is treated as its own class.
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate with an explicit metric: the fraction of test wines whose exact
# points value is predicted (the same quantity a classifier's `score` reports).
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# Reference point: always predicting the most frequent points value seen in
# the training set. The model is only informative if it beats this.
majority_points = y_train.mode().iloc[0]
baseline_accuracy = (y_test == majority_points).mean()

print(accuracy)
print("majority-class baseline:", baseline_accuracy)

# Scores noted by the original author from earlier runs. They are NOT directly
# comparable: scikit-learn's `score` is accuracy for classifiers but R^2 for
# regressors.
#   Classifiers (accuracy): DecisionTreeClassifier 12%, LogisticRegression 15%
#   Regressors (R^2):       LinearRegression 8%, BayesianRidge 6%, SVR 5%,
#                           DecisionTreeRegressor 7%, Ridge 5%

0.18181818181818182


