## Manual Updates to file:
- Convert excel file with three tabs to 3 CSV files
- Put dependent and independent variables in same file.
- Update column names for easier modelling
    - Replace space with _ to ensure easier readability

## Data Preprocessing
1. Convert all -1 to 0.
2. Remove Skewness from the data

## Data Analysis:
1. Check number of unique values
2. Check if missing values
3. Check Std. Deviation, min, max, quantiles etc

## Variable Description
    Var 1	Integer (min=0, no max)
    Var 2	Real (min=0, no max)
    Var 3	Integer (min=0, no max)
    Var 4	Real (Negative value possible though unlikely, no max)
    Var 5	One of 8 classes (so -1 here means not in that class)
    Var 6	One of 12 classes (so -1 here means not in that class)	
    Var 7	Integer
    Var 8	Integer
    Var 9	Integer
    Var 10	Integer
    Var 11	Real (no min or max)
    Var 12	Real (no min or max)
    Var 13	Integer (min=2, no max)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# To plot QQ plot
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# To view all the columns 
pd.set_option("display.max_columns", None)

In [3]:
data = pd.read_csv('Input_Set_1.csv')

In [4]:
data.head()

Unnamed: 0,Var_1,Var_2,Var_3,Var_4,Var_5_Class_1,Var_5_Class_2,Var_5_Class_3,Var_5_Class_4,Var_5_Class_5,Var_5_Class_6,Var_5_Class_7,Var_5_Class_8,Var_6_Class_1,Var_6_Class_2,Var_6_Class_3,Var_6_Class_4,Var_6_Class_5,Var_6_Class_6,Var_6_Class_7,Var_6_Class_8,Var_6_Class_9,Var_6_Class_10,Var_6_Class_11,Var_6_Class_12,Var_7,Var_8,Var_9,Var_10,Var_11,Var_12,Var_13,Input_Data_Set_1
0,5,0.93306,63,15.56694,-1,-1,-1,-1,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,0,-1,-1,39.737157,28.386285,2,1
1,8,2.124317,18,14.375683,-1,-1,-1,-1,-1,-1,1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1,39.737157,28.386285,2,0
2,18,1.618852,53,14.881148,-1,-1,-1,-1,-1,-1,1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,0,-1,-1,39.737157,28.386285,2,1
3,11,1.005464,12,15.494536,-1,-1,-1,-1,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,0,-1,-1,39.737157,28.386285,2,1
4,12,1.008197,11,14.27459,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,0,-1,-1,39.737157,28.386285,2,1


## Data Analysis

In [5]:
# Check shape of the data
data.shape

(1649, 32)

In [6]:
# Check how many unique values are there is the data. Helps to understand which column can be set as categorical vs numeric
data.nunique()

Var_1                 52
Var_2               1007
Var_3                 85
Var_4               1379
Var_5_Class_1          2
Var_5_Class_2          2
Var_5_Class_3          2
Var_5_Class_4          2
Var_5_Class_5          1
Var_5_Class_6          2
Var_5_Class_7          2
Var_5_Class_8          2
Var_6_Class_1          2
Var_6_Class_2          2
Var_6_Class_3          2
Var_6_Class_4          2
Var_6_Class_5          2
Var_6_Class_6          2
Var_6_Class_7          2
Var_6_Class_8          2
Var_6_Class_9          2
Var_6_Class_10         2
Var_6_Class_11         2
Var_6_Class_12         2
Var_7                  6
Var_8                 13
Var_9                  2
Var_10                 2
Var_11                15
Var_12                15
Var_13                15
Input_Data_Set_1       2
dtype: int64

In [7]:
# Check if there are any null values
data.isnull().sum()

Var_1               0
Var_2               0
Var_3               0
Var_4               0
Var_5_Class_1       0
Var_5_Class_2       0
Var_5_Class_3       0
Var_5_Class_4       0
Var_5_Class_5       0
Var_5_Class_6       0
Var_5_Class_7       0
Var_5_Class_8       0
Var_6_Class_1       0
Var_6_Class_2       0
Var_6_Class_3       0
Var_6_Class_4       0
Var_6_Class_5       0
Var_6_Class_6       0
Var_6_Class_7       0
Var_6_Class_8       0
Var_6_Class_9       0
Var_6_Class_10      0
Var_6_Class_11      0
Var_6_Class_12      0
Var_7               0
Var_8               0
Var_9               0
Var_10              0
Var_11              0
Var_12              0
Var_13              0
Input_Data_Set_1    0
dtype: int64

## Data Preprocessing

We need to convert all -1 to 0 to help machine understand it better

In [8]:
# Saving all the columns which have -1 and 0 values in data_columns variable
data_columns = ['Var_5_Class_1', 'Var_5_Class_2','Var_5_Class_3', 'Var_5_Class_4', 'Var_5_Class_5', 'Var_5_Class_6','Var_5_Class_7', 'Var_5_Class_8', 'Var_6_Class_1', 'Var_6_Class_2', 'Var_6_Class_3', 'Var_6_Class_4', 'Var_6_Class_5', 'Var_6_Class_6','Var_6_Class_7', 'Var_6_Class_8', 'Var_6_Class_9', 'Var_6_Class_10','Var_6_Class_11', 'Var_6_Class_12', 'Var_9', 'Var_10']

In [9]:
# Creating a function to convert all -1 to 0s

def convert_to_zero(df):
    for i in data_columns:
        for j in range(len(df)):
            if (df[i][j] == -1):
                df[i][j] = 0
            

In [10]:
convert_to_zero(data)

## Oversample minority class

In [11]:
data[data.Input_Data_Set_1==0].shape, data[data.Input_Data_Set_1==1].shape

((141, 32), (1508, 32))

In [12]:
from sklearn.utils import resample

# separate minority and majority classes
not_accept = data[data.Input_Data_Set_1==0]
accept = data[data.Input_Data_Set_1==1]

# upsample minority
not_accept_upsampled = resample(not_accept,
                          replace=True, # sample with replacement
                          n_samples=len(accept), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
data = pd.concat([not_accept_upsampled, accept])

In [13]:
data[data.Input_Data_Set_1==0].shape, data[data.Input_Data_Set_1==1].shape

((1508, 32), (1508, 32))

In [14]:
# Saving the preprocessed data in a seperate csv file so that we don't have to repeat above steps multiple times
data.to_csv("Input_Set_1_upsample.csv", index=False)

## Preprocess the data

In [15]:
# Capturing the cleaned data
data = pd.read_csv('Input_Set_1_upsample.csv')

In [16]:
data.shape

(3016, 32)

In [17]:
data_numeric = ['Var_1', 'Var_2', 'Var_3', 'Var_4', 'Var_7', 'Var_8', 'Var_11', 'Var_12', 'Var_13']

-0.5 to 0.5 -> Symmetric </br>
Less than -0.5 -> Negatively Skewed </br>
More then 0.5 -> Positively Skewed </br></br>


In [18]:
# Function to idenfity skewness
def data_skewness(df):
    for i in data_numeric:
        print(i, ":  ", df[i].skew())

In [19]:
data_skewness(data)

Var_1 :   1.027241434363162
Var_2 :   1.3818735456853921
Var_3 :   7.0346142907058065
Var_4 :   -1.902955030278428
Var_7 :   5.075627117627829
Var_8 :   2.732960748022389
Var_11 :   -1.116088537753502
Var_12 :   -1.0958683660079687
Var_13 :   -0.5421474178256848


In [20]:
# Removing skewness. Taking root for positive skewed data and power for negatively skewed data
data["Var_1"] = np.sqrt(data["Var_1"])
data["Var_2"] = np.sqrt(data["Var_2"])
data["Var_3"] = np.cbrt(data["Var_3"])
data["Var_7"] = np.cbrt(data["Var_7"])
data["Var_8"] = np.cbrt(data["Var_8"])

In [21]:

data["Var_11"] = np.power(data["Var_11"], 5)
data["Var_4"] = np.power(data["Var_4"], 6)

In [22]:
data_skewness(data)

Var_1 :   -0.635583238338176
Var_2 :   0.6540856233795456
Var_3 :   0.31394838153338317
Var_4 :   -0.2782315762868648
Var_7 :   3.643283281249254
Var_8 :   0.7158652674997894
Var_11 :   0.5092978371936222
Var_12 :   -1.0958683660079687
Var_13 :   -0.5421474178256848


# Model Building
We are building RandomForest model as it is less impacted by outliers

In [23]:
# After model building, we identified various features which were less important to model building. So we removed those. 

model_data = data.drop(['Var_5_Class_1', 'Var_5_Class_3', 'Var_5_Class_4', 'Var_5_Class_5', 'Var_10'], axis=1)

We will be using two ways to test out model:
- Train test split
- Cross Validation

In [24]:
# Segregating data in independent and dependent variables

X = model_data.drop(['Input_Data_Set_1'], axis=1)
y = model_data['Input_Data_Set_1']

In [25]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=122, stratify = y)

In [26]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean


# Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=10, random_state=22)

In [28]:
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(rf_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize performance
print('Accuracy %.3f' % mean(scores))

Accuracy 0.979


In [29]:
# Fitting a single model using train test split
rf_model.fit(train_X, train_y)

rf_predicted_test = rf_model.predict(test_X)
accuracy_rf = accuracy_score(rf_predicted_test, test_y)
accuracy_rf

0.9840848806366048

In [30]:
# Checking feature importance to identify features that can be eliminated.
feature_importance_rf = pd.Series(rf_model.feature_importances_, index=train_X.columns)

feature_importance_rf = feature_importance_rf.sort_values(ascending=False)

feature_importance_rf

Var_2             0.136398
Var_4             0.135844
Var_3             0.129917
Var_13            0.091716
Var_1             0.090806
Var_12            0.060937
Var_11            0.057764
Var_8             0.057019
Var_9             0.029804
Var_6_Class_8     0.019424
Var_7             0.018991
Var_6_Class_1     0.018056
Var_5_Class_8     0.017751
Var_6_Class_11    0.016723
Var_6_Class_2     0.015615
Var_5_Class_7     0.014671
Var_6_Class_7     0.013406
Var_6_Class_10    0.011416
Var_6_Class_9     0.010648
Var_6_Class_6     0.010367
Var_6_Class_5     0.010324
Var_6_Class_3     0.009442
Var_6_Class_4     0.007935
Var_6_Class_12    0.007467
Var_5_Class_2     0.005333
Var_5_Class_6     0.002228
dtype: float64

## Prediction
model_name = rf_model

In [83]:
predicted_data = pd.read_csv("v2_Inputs_for_Full_Predictions_1_imputed.csv")

In [84]:
data_skewness(predicted_data)

Var_1 :   3.9019749056127604
Var_2 :   1.2858898594763082
Var_3 :   18.706620448485587
Var_4 :   -1.2845748252297844
Var_7 :   40.15172939399318
Var_8 :   10.27761779642971
Var_11 :   -1.1923984299476802
Var_12 :   -0.1738235400552654
Var_13 :   -0.13667133273249232


In [85]:
# Removing skewness. Taking root for positive skewed data and power for negatively skewed data
predicted_data["Var_1"] = np.sqrt(predicted_data["Var_1"])
predicted_data["Var_2"] = np.sqrt(predicted_data["Var_2"])
predicted_data["Var_3"] = np.cbrt(predicted_data["Var_3"])
predicted_data["Var_7"] = np.cbrt(predicted_data["Var_7"])
predicted_data["Var_8"] = np.cbrt(predicted_data["Var_8"])
predicted_data["Var_11"] = np.power(predicted_data["Var_11"], 5)
#predicted_data["Var_4"] = np.power(predicted_data["Var_4"], 6)

In [86]:
data_skewness(predicted_data)

Var_1 :   0.49872883873460444
Var_2 :   0.3576557463036899
Var_3 :   0.9441237516931893
Var_4 :   -1.2845748252297844
Var_7 :   3.992715938235575
Var_8 :   0.4007817435375751
Var_11 :   -1.1991892847336831
Var_12 :   -0.1738235400552654
Var_13 :   -0.13667133273249232


In [87]:
# After model building, we identified various features which were less important to model building. So we removed those. 

model_data = predicted_data.drop(['Var_5_Class_1', 'Var_5_Class_3', 'Var_5_Class_4', 'Var_5_Class_5', 'Var_10'], axis=1)

In [88]:
predictions = rf_model.predict(model_data)

In [89]:
predictions_probability = rf_model.predict_proba(model_data)

In [90]:
predictions_probability_0 = predictions_probability[:,0]

In [91]:
predictions_probability_1 = predictions_probability[:,1]

In [92]:
predictions_probability_1[:10]

array([0.6, 0.5, 0.7, 0.6, 1. , 0.7, 0.9, 0.9, 0.8, 0.8])

In [93]:
prediction_confidence = []
for i in predictions_probability_0:
    if (i > 0.7):
        prediction_confidence.append(0)
    elif (i < 0.3):
        prediction_confidence.append(100)
    else:
        prediction_confidence.append(50)
        


In [94]:
predictions_probability_0[30:40]

array([0.4, 0.4, 0.5, 0.4, 0.5, 0.3, 0.3, 0.5, 0.2, 0.1])

In [95]:
output_file_df = pd.DataFrame({
    "DataSet": 1,
    "PredictionSet": 1,
    "Prediction": predictions,
   # "Pred_proba_0": predictions_probability_0,
   # "Pred_proba_1": predictions_probability_1,
    "Confidence": prediction_confidence
})

In [96]:
output_file_df.to_csv("Final_File_DS1_Pred1.csv", index=False)