In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# os.chdir("Desktop")

In [3]:
df = pd.read_csv('data.csv')

In [4]:
#Keeping a copy of the original dataframe for reference 
df_new = df.copy()

In [5]:
#Visualizing the data
df.head()

Unnamed: 0.1,Unnamed: 0,x0_t0,x0_t1,x0_t2,x0_t3,x0_t4,x0_t5,x0_t6,x1_t0,x1_t1,...,x53_t6,x54_t0,x54_t1,x54_t2,x54_t3,x54_t4,x54_t5,x54_t6,y1,y2
0,0,763.134041,768.107435,764.607484,765.236945,765.775778,765.619062,750.95,0.789998,0.801386,...,87.2,2.03,2.03,2.03,2.03,2.03,2.03,2.03,0.087,0.58
1,1,765.446797,763.464843,755.205877,754.690499,762.872823,767.775947,750.95,0.792258,0.807701,...,87.2,2.03,2.03,2.03,2.03,2.03,2.03,2.03,0.089,0.59
2,2,765.973495,768.980362,769.442794,771.473454,774.054794,762.862527,750.95,0.825336,0.795842,...,87.2,2.368016,2.385767,2.432579,2.306701,2.297906,2.03,2.03,0.05,0.44
3,3,764.650659,765.90954,768.443158,767.678032,767.260418,769.040021,750.95,0.781667,0.788325,...,87.2,2.03,2.03,2.03,2.03,2.03,2.03,2.03,0.089,0.44
4,4,760.405529,766.774665,767.547864,766.582823,765.5173,769.685324,750.95,0.812201,0.825667,...,87.2,2.231677,2.174763,2.351654,2.220314,2.204783,2.03,2.03,0.08,0.59


In [6]:
#First let us check for missing values in the dataframe
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

Series([], dtype: float64)

In [7]:
#No missing values found, so no imputation required. 

In [8]:
#Storing our two target variables
y1= df["y1"]
y2= df["y2"]

In [9]:
#Dropping the necessary columns from the dataframe

df.drop(["y1","y2"],1,inplace=True) #Dropping the target variables, since we already saved them separately

df.drop(["Unnamed: 0"],1,inplace=True) #Dropping redundant index column

In [10]:
# Let us store the unique features - sensors (x0 to x54)

col_series = pd.Series(df.columns)
sensors = col_series.apply(lambda x:x.rsplit("_")).str.get(0).unique()

In [11]:
#Actual headers with the time tag (t0 to t6) removed
headers= col_series.apply(lambda x:x.rsplit("_")).str.get(0)

In [12]:
headers[:5]

0    x0
1    x0
2    x0
3    x0
4    x0
dtype: object

In [13]:
#Now let us get all the actual headers of the data frame
list1 = []
for x in range(0,len(col_series),7):
    list1.append([col_series[x],col_series[x+1],col_series[x+2],col_series[x+3],
                  col_series[x+4],col_series[x+5],col_series[x+6]])

In [14]:
all_real_headers= list1

In [15]:
#Store the batches as our rows variable
rows = np.arange(len(list(df.index)))

In [16]:
#Create a function that takes as its input a sensor and returns all data for that sensor- all batches and times

def time_series(sensor):
    s = df[sensor]
    list_df= []
    for row in rows:
        x0 = s.iloc[row].T
        list_df.append(x0)
    return pd.concat(list_df)

In [17]:
# Iterate through all 55 sensors and store the values in their respective headers
listers=[]
for x in range(0,len(col_series),7):
    listers.append(time_series([col_series[x],col_series[x+1],col_series[x+2],col_series[x+3],col_series[x+4],col_series[x+5],col_series[x+6]]))

In [18]:
listers[0]

x0_t0    763.134041
x0_t1    768.107435
x0_t2    764.607484
x0_t3    765.236945
x0_t4    765.775778
x0_t5    765.619062
x0_t6    750.950000
x0_t0    765.446797
x0_t1    763.464843
x0_t2    755.205877
x0_t3    754.690499
x0_t4    762.872823
x0_t5    767.775947
x0_t6    750.950000
x0_t0    765.973495
x0_t1    768.980362
x0_t2    769.442794
x0_t3    771.473454
x0_t4    774.054794
x0_t5    762.862527
x0_t6    750.950000
x0_t0    764.650659
x0_t1    765.909540
x0_t2    768.443158
x0_t3    767.678032
x0_t4    767.260418
x0_t5    769.040021
x0_t6    750.950000
x0_t0    760.405529
x0_t1    766.774665
            ...    
x0_t5    750.855744
x0_t6    750.950000
x0_t0    752.755783
x0_t1    752.484435
x0_t2    744.772873
x0_t3    756.993609
x0_t4    761.817725
x0_t5    747.048507
x0_t6    750.950000
x0_t0    749.662068
x0_t1    745.642215
x0_t2    745.705766
x0_t3    745.962520
x0_t4    747.266960
x0_t5    743.441518
x0_t6    748.291985
x0_t0    743.475161
x0_t1    748.555371
x0_t2    746.904789


In [19]:
#Create an empty dataframe
processedDf = pd.DataFrame()

In [20]:
for x in range(0,len(sensors)):
    processedDf[sensors[x]] = listers[x].values

In [21]:
#We have now converted our original dataframe into a time series dataset- where the time variable is in the index
processedDf.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x45,x46,x47,x48,x49,x50,x51,x52,x53,x54
0,763.134041,0.789998,1,22.132677,6.7,765.972043,841.0,829.099976,4.4,0.326108,...,0,89.77,0.5,698.35,140.75,14.62,866.57,659.85,87.2,2.03
1,768.107435,0.801386,1,22.068269,6.7,764.606257,841.0,829.099976,4.4,0.322163,...,0,89.77,0.5,698.35,140.75,14.62,866.57,659.85,87.2,2.03
2,764.607484,0.788737,1,21.815364,6.7,768.211539,841.0,829.099976,4.4,0.280555,...,0,89.77,0.5,698.35,140.75,14.62,866.57,659.85,87.2,2.03
3,765.236945,0.786225,1,21.9678,6.7,762.067313,841.0,829.099976,4.4,0.333311,...,0,89.77,0.5,698.35,140.75,14.62,866.57,659.85,87.2,2.03
4,765.775778,0.798978,1,21.988524,6.7,762.628059,841.0,829.099976,4.4,0.279611,...,0,89.77,0.5,698.35,140.75,14.62,866.57,659.85,87.2,2.03


In [22]:
#Converting our target variables
y1_new = pd.DataFrame()
list2 = []
for x in range(0,len(y1)):
    list2.append(y1[x])
    list2.append(y1[x])
    list2.append(y1[x])
    list2.append(y1[x])
    list2.append(y1[x])
    list2.append(y1[x])
    list2.append(y1[x])
y1_new = pd.Series( (v for v in list2) )

In [23]:
y2_new = pd.DataFrame()

list2 = []
for x in range(0,len(y2)):
    list2.append(y2[x])
    list2.append(y2[x])
    list2.append(y2[x])
    list2.append(y2[x])
    list2.append(y2[x])
    list2.append(y2[x])
    list2.append(y2[x])

y2_new = pd.Series( (v for v in list2) )

In [24]:
#This concludes our preprocessing
#We now have our preprocessed feature matrix as well as our target variables

In [25]:
#Task 1: Achieve R2 score higher than 0.5 for label y1. You can shuffle the data while train-test splitting.

In [26]:
#Split the data into train and test sets

from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(processedDf, y1_new,test_size= 0.05, random_state=42)

In [27]:
#We will try linear regression, support vector machines, random forest and XGBoost algorithms

In [28]:
#Linear Regression

from sklearn.linear_model import LinearRegression
lreg = LinearRegression()

In [29]:
lreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
lreg_pred= lreg.predict(X_test)

In [31]:
from sklearn.metrics import r2_score

In [32]:
r2_score(y_test, lreg_pred) 

0.2694057413949932

In [33]:
#0.27 r2 score accuracy with linear regression

In [34]:
from sklearn.svm import SVR

In [35]:
svr= SVR()

In [36]:
svr.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [37]:
svr_pred= svr.predict(X_test)

In [38]:
r2_score(y_test, svr_pred)

-0.4266905879581695

In [39]:
#Negative values so this model is not compatible

In [40]:
#XGBoost

from xgboost import XGBRegressor

In [41]:
xgb= XGBRegressor()

In [42]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [43]:
xgb_pred= xgb.predict(X_test)

In [44]:
r2_score(y_test,xgb_pred)

0.4996101029105142

In [45]:
#0.5 r2 score with XGBoost. However we are aiming for higher than this.

In [66]:
#Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor
rf= RandomForestRegressor(n_estimators=50,min_samples_split=5,random_state=1)

In [67]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [68]:
rf_pred= rf.predict(X_test)

In [69]:
r2_score(y_test, rf_pred)

0.6676067198792742

In [55]:
#Random Forest provides us with the highest r2 so far- 0.66

In [56]:
#Let us try to improve upon our random forest by feature selection

In [70]:
#Finding the important features

importances_rf = pd.Series(rf.feature_importances_,
                          index = X_train.columns)
sorted_importances = importances_rf.sort_values(ascending=False)
new_features= sorted_importances.head(20)

In [71]:
new_features

x35    0.192565
x11    0.077788
x7     0.076969
x12    0.070226
x1     0.068910
x14    0.053778
x8     0.043891
x5     0.032232
x10    0.028744
x13    0.027155
x3     0.020508
x50    0.019410
x0     0.018340
x9     0.018161
x30    0.017959
x52    0.016147
x4     0.014924
x46    0.013883
x6     0.013516
x33    0.013337
dtype: float64

In [72]:
#Now let us retrain our model using only these top 20 features that have the highest importance
#If it improves results, we will condense the feature matrix to these 20 features only.

In [73]:
rf= RandomForestRegressor(n_estimators=50,min_samples_split=5,random_state=1)
rf.fit(X_train[new_features.index], y_train)
y_rf= rf.predict(X_test[new_features.index])
r2_score(y_test, y_rf)

0.6923289890743989

In [74]:
#We see a small improvement in the model using just the top 20 features.

In [75]:
#Let us also try to see if our XGBoost model improves with a smaller set of features

In [76]:
xgb= XGBRegressor()
xgb.fit(X_train[new_features.index], y_train)
y_xgb= xgb.predict(X_test[new_features.index])
r2_score(y_test, y_xgb)

0.49331562019942143

In [77]:
#Our XGBoost does not improve. Random forest still has the highest r2 score with 0.68.

In [78]:
#Therefore for Task 1 our final solution is a Random Forest Regressor trained on 20/55 of the original features

In [79]:
#Task 2: Achieve the highest possible R2 score for label y1. 
#You can NOT shuffle the data while train-test splitting.

In [80]:
#We will use our random forest regressor except this time, we will preserve the order of our original dataset

In [81]:
X_train, X_test, y_train, y_test = tts(processedDf, y1_new, test_size=0.05, random_state=0)

In [104]:
rf= RandomForestRegressor(n_estimators=100,min_samples_split=5,random_state=0)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [105]:
y_pred_rf= rf.predict(X_test)

In [106]:
r2_score(y_test, y_pred_rf)

0.5631211055863359

In [107]:
#Random Forest gives us 0.56 r2 score on the full dataset. Now lets try with only the top 20 features

In [108]:
importances_rf = pd.Series(rf.feature_importances_,
                          index = X_train.columns)
sorted_importances = importances_rf.sort_values(ascending=False)
new_features= sorted_importances.head(20)

In [109]:
rf= RandomForestRegressor(n_estimators=100,min_samples_split=5,random_state=0)
rf.fit(X_train[new_features.index], y_train)
y_rf= rf.predict(X_test[new_features.index])
r2_score(y_test, y_rf)

0.5903553120191563

In [110]:
#Using the feature subset gives us an r2 score of 0.6- small improvement. 

In [111]:
#Let us also try with XGBoost to see if preserving the dataset order improves r2 when modelling
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [112]:
xg_pred= xgb.predict(X_test)

In [113]:
r2_score(y_test, xg_pred)

0.3896576327388569

In [114]:
#0.39 score- so random forest is still much higher
#Therefore for Task 2 our Random Forest Regressor on a subset of 20 features gives us an r2 of 0.6

In [115]:
#Task 3: Achieve the highest possible R2 score for label y2. 
#You can NOT shuffle the data while train-test splitting.

In [116]:
#Train test split with the new output variable y2
X_train, X_test, y_train, y_test = tts(processedDf, y2_new, test_size=0.05, random_state=0)

In [117]:
#Lets begin with random forest 
rf= RandomForestRegressor(n_estimators=100,min_samples_split=5,random_state=0)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [118]:
y_rf= rf.predict(X_test)

In [119]:
r2_score(y_test, y_rf)

0.7153724210489585

In [120]:
#0.7 r2 score achieved with Random forest. Let us now try the feature selection approach here too. 

In [121]:
importances_rf = pd.Series(rf.feature_importances_,
                          index = X_train.columns)
sorted_importances = importances_rf.sort_values(ascending=False)
new_features= sorted_importances.head(20)

In [123]:
rf= RandomForestRegressor(n_estimators=100,min_samples_split=5,random_state=0)
rf.fit(X_train[new_features.index], y_train)
y_rf= rf.predict(X_test[new_features.index])
r2_score(y_test, y_rf)

0.7233042142512311

In [124]:
#Minor improvement in the r2 score with selecting only the most important 20 features. 

In [125]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [126]:
xg_pred= xgb.predict(X_test)

In [127]:
r2_score(y_test, xg_pred)

0.5203801675224408

In [128]:
#Let us see if using only the most important features improves our XGBoost model

In [129]:
xgb= XGBRegressor()
xgb.fit(X_train[new_features.index], y_train)
y_xgb= xgb.predict(X_test[new_features.index])
r2_score(y_test, y_xgb)

0.5748832760391441

In [130]:
#For task 3, our highest r2 score is 0.71 using a Random Forest Regressor with only top 20 features 