# ISA 414 - Managing Big Data
## Lecture 12 – Data Analysis
### Supervised Learning (Part I)

#### Slide 18: Make sure you install the *scikit-learn* (imported as *sklearn*) and *pandas* modules first by running **pip install scikit-learn** and **pip install pandas** in the Terminal.

In [8]:
import pandas as pd

# load the data set from the CSV file (relative path) into a pandas data frame
energy_data = pd.read_csv("energy_data.csv")

#### Slide 19

In [9]:
# checking the attribute (column) types of the data frame
energy_data.dtypes

# one can change the type of a column (series) to, say, int using the code below:
# energy_data['column_name'] = energy_data['column_name'].astype(int)

Age                         int64
MaritalStatus              object
AnnualConsumption           int64
DayNightConsumption       float64
IncomeLevel                object
DwellingArea               object
HasChildren                object
SolarRoof                  object
ShiftableLoad              object
AttitudeSustainability     object
Tariff                     object
dtype: object

#### Slide 20

In [10]:
# one-hot encode the categorical attributes; drop_first=True omits the dummy column
# of each attribute's first category, which is redundant given the remaining dummies
energy_data = pd.get_dummies(
    data=energy_data,
    columns=[
        "MaritalStatus",
        "IncomeLevel",
        "DwellingArea",
        "HasChildren",
        "SolarRoof",
        "ShiftableLoad",
        "AttitudeSustainability",
    ],
    drop_first=True,
)

In [11]:
# display the data frame to inspect the dummy columns (pandas abbreviates long output with "...")
energy_data

Unnamed: 0,Age,AnnualConsumption,DayNightConsumption,Tariff,MaritalStatus_'2',MaritalStatus_'3',MaritalStatus_'4',MaritalStatus_'5',MaritalStatus_'6',IncomeLevel_'2',...,DwellingArea_'2',DwellingArea_'3',DwellingArea_'4',HasChildren_'TRUE',SolarRoof_'TRUE',ShiftableLoad_'2',ShiftableLoad_'3',ShiftableLoad_'4',AttitudeSustainability_'2',AttitudeSustainability_'3'
0,60,4420,0.811033,'Tariff 3',0,0,1,0,0,0,...,0,1,0,0,1,0,1,0,0,0
1,43,4127,0.464281,'Tariff 1',0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,64,4564,0.785733,'Tariff 2',0,0,0,0,1,1,...,0,0,1,0,1,1,0,0,1,0
3,44,3689,0.618946,'Tariff 1',0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,1
4,19,5311,0.281707,'Tariff 1',0,0,0,1,0,1,...,0,0,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,33,2514,0.713894,'Tariff 1',0,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,0
996,60,4766,0.389425,'Tariff 2',0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
997,55,4360,0.444445,'Tariff 3',0,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1
998,60,5345,0.907093,'Tariff 1',0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0


#### Slide 21

In [12]:
from sklearn import preprocessing

# build a label encoder, which maps each distinct class label to an integer code
enc = preprocessing.LabelEncoder()

# first learn the set of Tariff labels ...
enc.fit(energy_data["Tariff"])

# ... then overwrite the Tariff column with the corresponding integer codes
energy_data["Tariff"] = enc.transform(energy_data["Tariff"])

# inspect the attribute types once more
energy_data.dtypes

Age                             int64
AnnualConsumption               int64
DayNightConsumption           float64
Tariff                          int64
MaritalStatus_'2'               uint8
MaritalStatus_'3'               uint8
MaritalStatus_'4'               uint8
MaritalStatus_'5'               uint8
MaritalStatus_'6'               uint8
IncomeLevel_'2'                 uint8
IncomeLevel_'3'                 uint8
IncomeLevel_'4'                 uint8
IncomeLevel_'5'                 uint8
DwellingArea_'2'                uint8
DwellingArea_'3'                uint8
DwellingArea_'4'                uint8
HasChildren_'TRUE'              uint8
SolarRoof_'TRUE'                uint8
ShiftableLoad_'2'               uint8
ShiftableLoad_'3'               uint8
ShiftableLoad_'4'               uint8
AttitudeSustainability_'2'      uint8
AttitudeSustainability_'3'      uint8
dtype: object

#### Slide 22

In [13]:
from sklearn.tree import DecisionTreeClassifier

# the class label to be predicted is the Tariff column
y = energy_data["Tariff"]

# every remaining column serves as a predictor (drop returns a new data frame without Tariff)
x = energy_data.drop("Tariff", axis="columns")

# instantiate a decision tree classifier and train it on the complete data set;
# fit returns the fitted estimator itself, so the two calls can be chained
model = DecisionTreeClassifier().fit(x, y)


#### Slide 26

In [15]:
from sklearn.model_selection import train_test_split

# 66% training and 34% test
# random_state fixes the shuffling, so the same rows end up in the training and
# test sets on every run and the reported results can be reproduced
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.34, random_state = 42)

In [16]:
# create decision tree classifier object
# random_state fixes the estimator's internal randomness (e.g., the choice between
# equally good splits), so refitting on the same data yields the same tree
model = DecisionTreeClassifier(random_state = 42)

# train decision tree classifier on the training set only
model = model.fit(x_train,y_train)

#### Slide 27

In [17]:
from sklearn import metrics

# predicting tariffs on the test set with the currently fitted model
y_pred = model.predict(x_test)

# printing overall accuracy on the test set by comparing the true labels (y_test)
# against the predicted labels (y_pred)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6558823529411765


#### Slide 32

In [18]:
from sklearn.ensemble import RandomForestClassifier

# creating a random forest with 200 trees
# random_state fixes the bootstrap and feature sampling used to grow the trees,
# so refitting on the same data yields the same forest and the same accuracy
model = RandomForestClassifier(n_estimators=200, random_state=42)

# fitting the model on the training set
model = model.fit(x_train,y_train)

# predicting tariffs on the test set
y_pred = model.predict(x_test)

# printing overall accuracy on the test set
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6441176470588236
