In [7]:
# importeer nodige libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

__INTERNE RED WINES DATASET__

In [8]:
# Load the internal red-wine dataset (semicolon-separated, ISO-8859-1 encoded).
redwines = pd.read_csv("redwine.csv", delimiter=";", encoding='iso-8859-1')

# Drop the columns that are not needed for this analysis.
redwines = redwines.drop(columns=['country', 'variety', 'id'])

# Extract the vintage year (1990-2029) from the wine title into its own column.
# The character class is written as [0-2]: the previous form "[0,1,2]" also
# accepted a literal comma, so a title containing e.g. "20,5" would have been
# picked up as a "year" and then failed the integer conversion below.
redwines['year'] = redwines['title'].str.extract(r"(199\d|20[0-2]\d)", expand=False)

# Remove the rows without a recognisable year (original note: 2465 -> 2461 rows).
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the unfiltered frame.
redwines = redwines[redwines['year'].notnull()].copy()
redwines['year'] = redwines['year'].astype(int)
redwines = redwines.set_index('year')

# Drop every remaining row that has a NaN in any column.
redwines = redwines.dropna()

__EXTERNE WEER DATASET__

In [9]:
def _decimal_comma_to_float(column, strip_flag=False):
    """Convert a string column that uses a decimal comma into floats.

    Parameters
    ----------
    column : pandas.Series of str
        Raw values such as "57,2".
    strip_flag : bool, default False
        When True, trailing "*" characters are stripped before conversion.

    Returns
    -------
    pandas.Series of float
    """
    cleaned = column.str.replace(',', '.', regex=False)
    if strip_flag:
        cleaned = cleaned.str.rstrip('*')
    return cleaned.astype(float)


def _fahrenheit_to_celsius(fahrenheit):
    """Convert degrees Fahrenheit to degrees Celsius."""
    return (fahrenheit - 32) / 1.8


# Load the external weather dataset (semicolon-separated, ISO-8859-1 encoded).
weather_conditions = pd.read_csv("weatherdata_lisbon.csv", delimiter=";", encoding='iso-8859-1')

# Drop the unused columns, drop rows with missing values and give the
# remaining columns readable names.
weather_conditions = (
    weather_conditions
    .drop(columns=['STN---', 'Unnamed: 5', 'Unnamed: 7', 'GUST', 'PRCP', 'SNDP', 'VISIB'])
    .dropna()
    .rename(columns={
        'YEARMODA': 'Year',
        'DEWP': 'Dewpoint Temperature',
        'TEMP': 'Temperature',
        'MAX': 'Max Temperature',
        'MIN': 'Min Temperature',
        'WDSP': 'Windspeed',
        'MXSPD': 'Max Windspeed',
    })
)

# Clean up and convert data types; temperatures go from Fahrenheit to Celsius.
# Only the max/min temperature columns have a trailing "*" stripped.
for column_name in ('Max Temperature', 'Min Temperature'):
    weather_conditions[column_name] = _fahrenheit_to_celsius(
        _decimal_comma_to_float(weather_conditions[column_name], strip_flag=True)
    )
for column_name in ('Temperature', 'Dewpoint Temperature'):
    weather_conditions[column_name] = _fahrenheit_to_celsius(
        _decimal_comma_to_float(weather_conditions[column_name])
    )
for column_name in ('Windspeed', 'Max Windspeed'):
    weather_conditions[column_name] = _decimal_comma_to_float(weather_conditions[column_name])

# Despite its name, 'Year' holds the full observation date.
weather_conditions['Year'] = pd.to_datetime(weather_conditions['Year'].astype(str), format='%Y-%m-%d')

# FRSHTT is a six-character flag string, one 0/1 digit per weather event;
# it is left-padded with zeros to six characters before being split.
# NOTE(review): the NOAA GSOD documentation describes the first flag as fog,
# not frost - the 'Frost' column name is kept so downstream cells keep
# working; confirm and rename if appropriate.
frshtt_flags = weather_conditions['FRSHTT'].apply(lambda x: '{0:0>6}'.format(x))
for position, event_name in enumerate(('Frost', 'Rain', 'Snow', 'Hail', 'Thunder', 'Tornado')):
    weather_conditions[event_name] = frshtt_flags.str[position].astype(float)
weather_conditions = weather_conditions.drop(columns='FRSHTT')

# Remove outliers (presumably sentinel values for missing readings - confirm
# against the data source). All conditions must hold for a row to be kept.
plausible_rows = (
    (weather_conditions['Min Temperature'] < 50)
    & (weather_conditions['Max Temperature'] < 50)
    & (weather_conditions['Temperature'] < 50)
    & (weather_conditions['Windspeed'] < 50)
    & (weather_conditions['Max Windspeed'] < 50)
    & (weather_conditions['Dewpoint Temperature'] < 100)
)
weather_conditions = weather_conditions[plausible_rows].copy()

# Helper column from the original cell, kept in case later cells use it.
weather_conditions['Count'] = 1

# Average every weather property per calendar year. Selecting the numeric
# columns before aggregating keeps the datetime 'Year' column out of the
# aggregation (summing it raises on recent pandas versions), and .mean()
# replaces the manual sum / count division.
weather_columns = ['Temperature', 'Dewpoint Temperature', 'Windspeed', 'Max Windspeed',
                   'Max Temperature', 'Min Temperature',
                   'Frost', 'Rain', 'Snow', 'Hail', 'Thunder', 'Tornado']
average_weather_conditions_peryear = (
    weather_conditions
    .groupby(weather_conditions['Year'].dt.year)[weather_columns]
    .mean()
)

# 2018 has only a single measurement, so it is excluded. A plain label is used
# instead of pd.Int64Index (removed in pandas 2.0); errors='ignore' keeps the
# cell runnable when 2018 is not present at all.
average_weather_conditions_peryear = average_weather_conditions_peryear.drop(index=2018, errors='ignore')

Unnamed: 0_level_0,Temperature,Temperature,Dewpoint Temperature,Dewpoint Temperature,Windspeed,Windspeed,Max Windspeed,Max Windspeed,Max Temperature,Max Temperature,...,Rain,Rain,Snow,Snow,Hail,Hail,Thunder,Thunder,Tornado,Tornado
Unnamed: 0_level_1,sum,count,sum,count,sum,count,sum,count,sum,count,...,sum,count,sum,count,sum,count,sum,count,sum,count
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1990,5770.944444,331,4008.777778,331,2183.4,331,4346.3,331,7344.0,331,...,112.0,331,0.0,331,0.0,331,22.0,331,0.0,331
1991,5504.333333,312,3168.5,312,2209.0,312,4447.4,312,7078.0,312,...,88.0,312,0.0,312,0.0,312,9.0,312,0.0,312
1992,5615.055556,337,3370.777778,337,2471.5,337,4800.7,337,7291.0,337,...,90.0,337,1.0,337,1.0,337,7.0,337,0.0,337
1993,5951.944444,363,4182.0,363,2484.0,363,4694.5,363,7528.0,363,...,126.0,363,1.0,363,1.0,363,11.0,363,2.0,363
1994,6170.055556,363,4421.0,363,2617.4,363,4914.4,363,7793.0,363,...,108.0,363,2.0,363,0.0,363,15.0,363,0.0,363
1995,6542.555556,363,4320.166667,363,2537.6,363,4932.9,363,8173.0,363,...,112.0,363,0.0,363,0.0,363,13.0,363,1.0,363
1996,6272.388889,366,4504.055556,366,2607.7,366,5117.6,366,7867.0,366,...,141.0,366,1.0,366,0.0,366,21.0,366,0.0,366
1997,6632.944444,364,4852.944444,364,2706.5,364,4857.6,364,8212.0,364,...,138.0,364,3.0,364,3.0,364,22.0,364,0.0,364
1998,6353.222222,361,4439.222222,361,2849.1,361,4894.6,361,8041.0,361,...,99.0,361,1.0,361,0.0,361,10.0,361,0.0,361
1999,6296.722222,362,4579.944444,362,2976.1,362,5030.5,362,7838.0,362,...,112.0,362,1.0,362,0.0,362,10.0,362,0.0,362


__MERGING EXTERNE EN INTERNE DATASETS__

In [10]:
# Keep only the rating column of the wine data; its index is the vintage year.
redwines_year_comparison = redwines.loc[:, ['points']]

# Index-on-index merge (pandas' default inner join): every wine gets the
# yearly weather averages of its vintage year; wines whose year has no
# weather row are dropped.
redwines_with_yeartemperatures = pd.merge(
    redwines_year_comparison,
    average_weather_conditions_peryear,
    left_index=True,
    right_index=True,
)
redwines_with_yeartemperatures.head()



Unnamed: 0,points,"(Temperature, sum)","(Temperature, count)","(Dewpoint Temperature, sum)","(Dewpoint Temperature, count)","(Windspeed, sum)","(Windspeed, count)","(Max Windspeed, sum)","(Max Windspeed, count)","(Max Temperature, sum)",...,"(Rain, sum)","(Rain, count)","(Snow, sum)","(Snow, count)","(Hail, sum)","(Hail, count)","(Thunder, sum)","(Thunder, count)","(Tornado, sum)","(Tornado, count)"
1996,86,6272.388889,366,4504.055556,366,2607.7,366,5117.6,366,7867.0,...,141.0,366,1.0,366,0.0,366,21.0,366,0.0,366
2000,87,6159.388889,366,4295.388889,366,3079.0,366,5222.3,366,7792.0,...,124.0,366,2.0,366,1.0,366,12.0,366,0.0,366
2000,87,6159.388889,366,4295.388889,366,3079.0,366,5222.3,366,7792.0,...,124.0,366,2.0,366,1.0,366,12.0,366,0.0,366
2002,90,6160.277778,365,4643.055556,365,3086.3,365,5291.5,365,7704.0,...,133.0,365,0.0,365,0.0,365,16.0,365,0.0,365
2003,89,6298.222222,365,4392.0,365,3015.1,365,5353.0,365,7873.0,...,119.0,365,0.0,365,0.0,365,14.0,365,0.0,365


__CREATING THE MODEL__

In [11]:
# Feature columns listed by the original author:
# ['Temperature', 'Dewpoint Temperature','Windspeed','Max Windspeed','Max Temperature','Min Temperature','Frost','Rain','Snow','Hail','Thunder','Tornado']

# Target (dependent) is the 'points' column; features (independent) are all
# other columns of the merged frame.
dependent = redwines_with_yeartemperatures.loc[:, 'points']
independent = redwines_with_yeartemperatures.drop(columns='points')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.2,
    random_state=0,
)

# Set up the model and fit it on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out split and print the model's own score.
test_score = model.score(X_test, y_test)
print(test_score)

# Scores noted by the original author for the models that were tried.
# NOTE(review): scikit-learn's .score() is accuracy for classifiers but R^2
# for regressors, so these figures may not be directly comparable - confirm
# how each number was obtained.
# LinearRegression 8%
# DecisionTreeClassifier 12%
# LogisticRegression 15%
# BayesianRidge 6%
# SVR 5%
# DecisionTreeRegressor 7%
# Ridge 5%



0.14320388349514562
