## Manual Updates to file:
- Convert excel file with three tabs to 3 CSV files
- Put dependent and independent variables in same file.
- Update column names for easier modelling
    - Replace space with _ to ensure easier readability

## Data Preprocessing
1. Convert all -1 to 0.
2. Remove Skewness from the data

## Data Analysis:
1. Check number of unique values
2. Check if missing values
3. Check Std. Deviation, min, max, quantiles etc

## Variable Description
    Var 1	Integer (min=0, no max)
    Var 2	Real (min=0, no max)
    Var 3	Integer (min=0, no max)
    Var 4	Real (Negative value possible though unlikely, no max)
    Var 5	One of 8 classes (so -1 here means not in that class)
    Var 6	One of 12 classes (so -1 here means not in that class)	
    Var 7	Integer
    Var 8	Integer
    Var 9	Integer
    Var 10	Integer
    Var 11	Real (no min or max)
    Var 12	Real (no min or max)
    Var 13	Integer (min=2, no max)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# To plot QQ plot
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# To view all the columns 
pd.set_option("display.max_columns", None)

In [3]:
data = pd.read_csv('Input_Set_model_3.csv')

In [4]:
data.head()

Unnamed: 0,Var_1,Var_2,Var_3,Var_4,Var_5_Class_1,Var_5_Class_2,Var_5_Class_3,Var_5_Class_4,Var_5_Class_5,Var_5_Class_6,Var_5_Class_7,Var_5_Class_8,Var_6_Class_1,Var_6_Class_2,Var_6_Class_3,Var_6_Class_4,Var_6_Class_5,Var_6_Class_6,Var_6_Class_7,Var_6_Class_8,Var_6_Class_9,Var_6_Class_10,Var_6_Class_11,Var_6_Class_12,Var_7,Var_8,Var_9,Var_10,Var_11,Var_12,Var_13,Input_Data_Set_3
0,5,0.93306,63,7.56694,-1,-1,-1,-1,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,0,-1,-1,15.133771,16.395182,2,1.0
1,8,2.124317,18,6.375683,-1,-1,-1,-1,-1,-1,1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1,15.133771,16.395182,2,0.0
2,18,1.618852,53,6.881148,-1,-1,-1,-1,-1,-1,1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1,15.133771,16.395182,2,0.0
3,11,1.005464,12,7.494536,-1,-1,-1,-1,-1,-1,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,0,-1,-1,15.133771,16.395182,2,0.0
4,12,1.008197,11,6.27459,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,0,-1,-1,15.133771,16.395182,2,1.0


## Data Analysis

In [5]:
# Check shape of the data
data.shape

(651, 32)

In [6]:
# Check how many unique values are there is the data. Helps to understand which column can be set as categorical vs numeric
data.nunique()

Var_1                52
Var_2               526
Var_3                83
Var_4               573
Var_5_Class_1         2
Var_5_Class_2         2
Var_5_Class_3         2
Var_5_Class_4         2
Var_5_Class_5         1
Var_5_Class_6         2
Var_5_Class_7         2
Var_5_Class_8         2
Var_6_Class_1         2
Var_6_Class_2         2
Var_6_Class_3         2
Var_6_Class_4         2
Var_6_Class_5         2
Var_6_Class_6         2
Var_6_Class_7         2
Var_6_Class_8         2
Var_6_Class_9         2
Var_6_Class_10        2
Var_6_Class_11        2
Var_6_Class_12        2
Var_7                 4
Var_8                14
Var_9                 2
Var_10                2
Var_11                8
Var_12                9
Var_13                8
Input_Data_Set_3      2
dtype: int64

In [7]:
data.shape

(651, 32)

In [8]:
# Check if there are any null values
data.isnull().sum()

Var_1                0
Var_2                0
Var_3                0
Var_4                0
Var_5_Class_1        0
Var_5_Class_2        0
Var_5_Class_3        0
Var_5_Class_4        0
Var_5_Class_5        0
Var_5_Class_6        0
Var_5_Class_7        0
Var_5_Class_8        0
Var_6_Class_1        0
Var_6_Class_2        0
Var_6_Class_3        0
Var_6_Class_4        0
Var_6_Class_5        0
Var_6_Class_6        0
Var_6_Class_7        0
Var_6_Class_8        0
Var_6_Class_9        0
Var_6_Class_10       0
Var_6_Class_11       0
Var_6_Class_12       0
Var_7                0
Var_8                0
Var_9                0
Var_10               0
Var_11              14
Var_12               0
Var_13               0
Input_Data_Set_3     1
dtype: int64

In [9]:
# Drop these columns. Ideally we should replace these columns with mean median or mode. But don't have additional information for this
data.drop(data[pd.isna(data.Var_11)].index, inplace=True)

In [10]:
data.shape

(637, 32)

## Data Preprocessing

We need to convert all -1 to 0 to help machine understand it better

In [11]:
# Saving all the columns which have -1 and 0 values in data_columns variable
data_columns = ['Var_5_Class_1', 'Var_5_Class_2','Var_5_Class_3', 'Var_5_Class_4', 'Var_5_Class_5', 'Var_5_Class_6','Var_5_Class_7', 'Var_5_Class_8', 'Var_6_Class_1', 'Var_6_Class_2', 'Var_6_Class_3', 'Var_6_Class_4', 'Var_6_Class_5', 'Var_6_Class_6','Var_6_Class_7', 'Var_6_Class_8', 'Var_6_Class_9', 'Var_6_Class_10','Var_6_Class_11', 'Var_6_Class_12', 'Var_9', 'Var_10']

In [12]:
# Creating a function to convert all -1 to 0s

def convert_to_zero(df):
    for i in data_columns:
        for j in range(len(df)):
            if (df[i][j] == -1):
                df[i][j] = 0
            

In [13]:
convert_to_zero(data)

In [14]:
data.Input_Data_Set_3.value_counts()

1.0    342
0.0    295
Name: Input_Data_Set_3, dtype: int64

## Oversample minority class

In [15]:
data[data.Input_Data_Set_3==0].shape, data[data.Input_Data_Set_3==1].shape

((295, 32), (342, 32))

In [16]:
from sklearn.utils import resample

# separate minority and majority classes
not_accept = data[data.Input_Data_Set_3==0]
accept = data[data.Input_Data_Set_3==1]

# upsample minority
not_accept_upsampled = resample(not_accept,
                          replace=True, # sample with replacement
                          n_samples=len(accept), # match number in majority class
                          #n_samples=2500000,
                          random_state=27) # reproducible results

# combine majority and upsampled minority
data = pd.concat([not_accept_upsampled, accept])

In [17]:
data[data.Input_Data_Set_3==0].shape, data[data.Input_Data_Set_3==1].shape

((342, 32), (342, 32))

In [18]:
# Saving the preprocessed data in a seperate csv file so that we don't have to repeat above steps multiple times
data.to_csv("Input_Set_3_upsample.csv", index=False)

## Preprocess the data

In [19]:
# Capturing the cleaned data
data = pd.read_csv('Input_Set_3_upsample.csv')

In [20]:
data.shape

(684, 32)

In [21]:
# Creating a variable for numeric data
data_numeric = ['Var_1', 'Var_2', 'Var_3', 'Var_4', 'Var_7', 'Var_8', 'Var_11', 'Var_12', 'Var_13']

-0.5 to 0.5 -> Symmetric </br>
Less than -0.5 -> Negatively Skewed </br>
More then 0.5 -> Positively Skewed </br></br>


In [22]:
# Function to idenfity skewness
def data_skewness(df):
    for i in data_numeric:
        print(i, ":  ", df[i].skew())

In [23]:
data_skewness(data)

Var_1 :   1.4259445663557886
Var_2 :   1.2619211285902883
Var_3 :   4.5776283593422145
Var_4 :   -2.001079314276153
Var_7 :   3.210098082179848
Var_8 :   3.2999263026221857
Var_11 :   0.2524942974474454
Var_12 :   0.267850340485727
Var_13 :   -0.027995914651378892


In [24]:
# Removing skewness. Taking root for positive skewed data and power for negatively skewed data
data["Var_1"] = np.sqrt(data["Var_1"])
data["Var_2"] = np.sqrt(data["Var_2"])
data["Var_3"] = np.cbrt(data["Var_3"])
data["Var_7"] = np.cbrt(data["Var_7"])
data["Var_8"] = np.cbrt(data["Var_8"])
data["Var_12"] = np.power(data["Var_12"], 2)
data["Var_4"] = np.power(data["Var_4"], 6)

In [25]:
data_skewness(data)

Var_1 :   0.15287721840159382
Var_2 :   0.5543573175245293
Var_3 :   0.5129061407536686
Var_4 :   0.5815776812037916
Var_7 :   2.6219113083699614
Var_8 :   -0.18364151819323887
Var_11 :   0.2524942974474454
Var_12 :   0.5746641537484819
Var_13 :   -0.027995914651378892


# Model Building
We are building RandomForest model as it is less impacted by outliers

In [26]:
# Saving data so far in a seperate DF
model_data = data

In [27]:
# After model building, we identified various features which were less important to model building. So we removed those. 
model_data = data.drop(['Var_5_Class_5', 'Var_5_Class_1', 'Var_5_Class_4'], axis=1)

In [28]:
# Segregating data in independent and dependent variables

X = model_data.drop(['Input_Data_Set_3'], axis=1)
y = model_data['Input_Data_Set_3']

We will be using two ways to test out model:
- Train test split
- Cross Validation

In [29]:

from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=122, stratify = y)

In [30]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean


# Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=10, random_state=2)

In [32]:
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(rf_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize performance
print('Accuracy %.3f' % mean(scores))

Accuracy 0.745


In [33]:
# Fitting a single model using train test split
rf_model.fit(train_X, train_y)

rf_predicted_test = rf_model.predict(test_X)
accuracy_rf = accuracy_score(rf_predicted_test, test_y)
accuracy_rf

0.7134502923976608

In [34]:
# Checking feature importance to identify features that can be eliminated.
feature_importance_rf = pd.Series(rf_model.feature_importances_, index=train_X.columns)

feature_importance_rf = feature_importance_rf.sort_values(ascending=False)

feature_importance_rf

Var_2             0.136528
Var_3             0.120230
Var_1             0.111470
Var_4             0.098735
Var_8             0.080266
Var_13            0.061487
Var_9             0.055063
Var_10            0.053698
Var_12            0.039379
Var_11            0.034413
Var_5_Class_8     0.021901
Var_7             0.021316
Var_6_Class_2     0.016317
Var_6_Class_12    0.015967
Var_6_Class_7     0.014912
Var_6_Class_1     0.014163
Var_6_Class_9     0.012657
Var_5_Class_7     0.012188
Var_6_Class_5     0.011814
Var_6_Class_8     0.011190
Var_6_Class_4     0.010416
Var_6_Class_11    0.009739
Var_6_Class_3     0.008786
Var_6_Class_6     0.008276
Var_6_Class_10    0.007192
Var_5_Class_2     0.005702
Var_5_Class_6     0.005215
Var_5_Class_3     0.000981
dtype: float64

## Prediction
model_name = rf_model

In [38]:
predicted_data = pd.read_csv("v2_Inputs_for_Full_Predictions_3_imputed.csv")

In [39]:
data_skewness(predicted_data)

Var_1 :   3.9019749056127604
Var_2 :   1.2858898594763082
Var_3 :   18.706620448485587
Var_4 :   -1.284574825229781
Var_7 :   39.19867678667724
Var_8 :   9.854938149687225
Var_11 :   2.131511019460773
Var_12 :   -1.5937560903840868
Var_13 :   -0.13667133273249232


In [40]:
# Removing skewness. Taking root for positive skewed data and power for negatively skewed data
predicted_data["Var_1"] = np.sqrt(predicted_data["Var_1"])
predicted_data["Var_2"] = np.sqrt(predicted_data["Var_2"])
predicted_data["Var_3"] = np.cbrt(predicted_data["Var_3"])
predicted_data["Var_7"] = np.cbrt(predicted_data["Var_7"])
predicted_data["Var_8"] = np.cbrt(predicted_data["Var_8"])
predicted_data["Var_12"] = np.power(predicted_data["Var_12"], 2)
#predicted_data["Var_4"] = np.power(predicted_data["Var_4"], 6)

In [41]:
data_skewness(predicted_data)

Var_1 :   0.49872883873460444
Var_2 :   0.3576557463036899
Var_3 :   0.9441237516931893
Var_4 :   -1.284574825229781
Var_7 :   3.9417182051002477
Var_8 :   0.3735575358600709
Var_11 :   2.131511019460773
Var_12 :   -1.4997594448550235
Var_13 :   -0.13667133273249232


In [42]:
# After model building, we identified various features which were less important to model building. So we removed those. 
model_data = predicted_data.drop(['Var_5_Class_5', 'Var_5_Class_1', 'Var_5_Class_4'], axis=1)

In [43]:
predictions = rf_model.predict(model_data)

In [44]:
predictions_probability = rf_model.predict_proba(model_data)

In [45]:
predictions_probability_0 = predictions_probability[:,0]

In [46]:
prediction_confidence = []
for i in predictions_probability_0:
    if (i > 0.7):
        prediction_confidence.append(0)
    elif (i < 0.3):
        prediction_confidence.append(100)
    else:
        prediction_confidence.append(50)
        


In [47]:
output_file_df = pd.DataFrame({
    "DataSet": 3,
    "PredictionSet": 3,
    "Prediction": predictions,
   # "Pred_proba_0": predictions_probability_0,
   # "Pred_proba_1": predictions_probability_1,
    "Confidence": prediction_confidence
})

In [48]:
output_file_df.to_csv("Final_File_DS3_Pred3.csv", index=False)