# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
Self_harm_EDA = pd.read_csv('/Users/hayaldargin/Desktop/Springboard/Springboard_Work/Capstone_2/Final_Data/Self_harm_EDA.csv', index_col=0)
Self_harm_EDA 

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
6,Male,White Non-Hispanice,00-04,2001,0,5862483
7,Male,Black,00-04,2001,0,1585658
8,Male,Hispanic,00-04,2001,0,1900490
9,Male,Other Non-Hispanic,00-04,2001,0,518145
11,Female,White Non-Hispanice,00-04,2001,0,5572690
...,...,...,...,...,...,...
5780,Male,Other Non-Hispanic,85+,2018,0,116478
5783,Female,White Non-Hispanice,85+,2018,0,3349454
5784,Female,Black,85+,2018,0,366329
5785,Female,Hispanic,85+,2018,0,304983


# 1.Splitting of the data set in Training and Validation sets

In [3]:
len(Self_harm_EDA) * .8, len(Self_harm_EDA) * .2

(2104.8, 526.2)

In [4]:

X_train, X_test, y_train, y_test = train_test_split(Self_harm_EDA.drop(columns='Injuries'), 
                                                    Self_harm_EDA.Injuries, test_size=0.2)

In [5]:
X_train.shape, X_test.shape

((2104, 5), (527, 5))

In [6]:
y_train.shape, y_test.shape

((2104,), (527,))

Here I have passed-in X and y as arguments in train_test_split, which splits X and y such that there is 20% testing data and 80% training data successfully split between X_train, X_test, y_train, and y_test.

# 2. Taking Care of Missing Values

In [7]:
Self_harm_EDA.isna().sum()

Sex                0
Race/Ethnicity    39
Age Group          0
Year               0
Injuries           0
Population         0
dtype: int64

Here I can see that we have 39 missing values in Race/Ethnicity column. One approach to fill in missing values is to fill it with the mean of that column, which is the average of that column but Race/Ethnicity column is categorical that is by I will drop these missing rows. 

In [8]:
Self_harm_cleaned = Self_harm_EDA.dropna(axis = 0, how ='any')    
Self_harm_cleaned

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
6,Male,White Non-Hispanice,00-04,2001,0,5862483
7,Male,Black,00-04,2001,0,1585658
8,Male,Hispanic,00-04,2001,0,1900490
9,Male,Other Non-Hispanic,00-04,2001,0,518145
11,Female,White Non-Hispanice,00-04,2001,0,5572690
...,...,...,...,...,...,...
5780,Male,Other Non-Hispanic,85+,2018,0,116478
5783,Female,White Non-Hispanice,85+,2018,0,3349454
5784,Female,Black,85+,2018,0,366329
5785,Female,Hispanic,85+,2018,0,304983


In [9]:
Self_harm_cleaned.isna().sum()

Sex               0
Race/Ethnicity    0
Age Group         0
Year              0
Injuries          0
Population        0
dtype: int64

And now, I can see that there are no missing values in the data set.

# 3. Taking care of Categorical Features

I can take care of categorical features by converting them to integers by using Label Encoding method.

In [10]:
Self_harm_cleaned["Sex"].value_counts()

Female    1296
Male      1296
Name: Sex, dtype: int64

In [11]:
Self_harm_cleaned["Race/Ethnicity"].value_counts()

Other Non-Hispanic     648
Hispanic               648
White Non-Hispanice    648
Black                  648
Name: Race/Ethnicity, dtype: int64

In [12]:
convert_sex= {"Sex":     {"Male": 1, "Female": 0}}
               

In [13]:
Self_harm_cleaned = Self_harm_cleaned.replace(convert_sex)
Self_harm_cleaned.head()

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
6,1,White Non-Hispanice,00-04,2001,0,5862483
7,1,Black,00-04,2001,0,1585658
8,1,Hispanic,00-04,2001,0,1900490
9,1,Other Non-Hispanic,00-04,2001,0,518145
11,0,White Non-Hispanice,00-04,2001,0,5572690


In [14]:
convert_race = { "Race/Ethnicity": {"Black":1, "Other Non-Hispanic":2, "Hispanic":3, "White Non-Hispanice":4}}

In [15]:
Self_harm_cleaned = Self_harm_cleaned.replace(convert_race)
Self_harm_cleaned.head()

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
6,1,4,00-04,2001,0,5862483
7,1,1,00-04,2001,0,1585658
8,1,3,00-04,2001,0,1900490
9,1,2,00-04,2001,0,518145
11,0,4,00-04,2001,0,5572690


In [16]:
Self_harm_cleaned["Age Group"].value_counts()

60-64    144
80-84    144
15-19    144
40-44    144
05-09    144
00-04    144
10-14    144
65-69    144
30-34    144
85+      144
35-39    144
70-74    144
45-49    144
20-24    144
25-29    144
55-59    144
75-79    144
50-54    144
Name: Age Group, dtype: int64

In [17]:
convert_age = {"Age Group":{"00-04":1, "05-09":2,"10-14":3, "15-19":4,  "20-24":5, "25-29":6, "30-34":7, 
                            "35-39":8, "40-44":9, "45-49":10, "50-54":11, "55-59":12, "60-64":13, "65-69":14, 
                            "70-74":15, "75-79":16, "80-84":17, "85+":1}}

In [18]:
Self_harm_cleaned = Self_harm_cleaned.replace(convert_age)
Self_harm_cleaned.head()

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
6,1,4,1,2001,0,5862483
7,1,1,1,2001,0,1585658
8,1,3,1,2001,0,1900490
9,1,2,1,2001,0,518145
11,0,4,1,2001,0,5572690


In [19]:
Self_harm_cleaned.dtypes

Sex               int64
Race/Ethnicity    int64
Age Group         int64
Year              int64
Injuries          int64
Population        int64
dtype: object

Now I have converted all my columns to numeric.

# 4. Normalizing the Dataset

This is the last part of data preprocessing, which is the normalization of my the dataset. It is proven from certain experimentation that Machine Learning and Deep Learning Models perform way better on a normalized data set as compared to a data set that is not normalized therefore i want to normalize my dataset. 
The goal of normalization here is to change values to a common scale without distorting the difference between the range of values.

In [20]:
Self_harm_cleaned.describe()

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
count,2592.0,2592.0,2592.0,2592.0,2592.0,2592.0
mean,0.5,2.5,8.555556,2009.5,1877.717207,2132055.0
std,0.500096,1.11825,5.102424,5.189129,4785.736155,2221288.0
min,0.0,1.0,1.0,2001.0,0.0,32321.0
25%,0.0,1.75,4.0,2005.0,0.0,578435.5
50%,0.5,2.5,8.5,2009.5,0.0,1385744.0
75%,1.0,3.25,13.0,2014.0,0.0,2358336.0
max,1.0,4.0,17.0,2018.0,41408.0,8220183.0


In [21]:
from sklearn.preprocessing import Normalizer

norm = Normalizer()

Self_harm_cleaned.iloc[:,1:-1] = norm.fit_transform(Self_harm_cleaned.iloc[:,1:-1])

Self_harm_cleaned

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
6,1,0.001999,0.000500,0.999998,0.0,5862483
7,1,0.000500,0.000500,1.000000,0.0,1585658
8,1,0.001499,0.000500,0.999999,0.0,1900490
9,1,0.000999,0.000500,0.999999,0.0,518145
11,0,0.001999,0.000500,0.999998,0.0,5572690
...,...,...,...,...,...,...
5780,1,0.000991,0.000496,0.999999,0.0,116478
5783,0,0.001982,0.000496,0.999998,0.0,3349454
5784,0,0.000496,0.000496,1.000000,0.0,366329
5785,0,0.001487,0.000496,0.999999,0.0,304983


In [22]:
Self_harm_cleaned.describe()

Unnamed: 0,Sex,Race/Ethnicity,Age Group,Year,Injuries,Population
count,2592.0,2592.0,2592.0,2592.0,2592.0,2592.0
mean,0.5,0.001005,0.003763,0.863963,0.188479,2132055.0
std,0.500096,0.00054,0.002781,0.285721,0.369415,2221288.0
min,0.0,9.6e-05,9.6e-05,0.048629,0.0,32321.0
25%,0.0,0.000498,0.000995,0.999964,0.0,578435.5
50%,0.5,0.000995,0.003477,0.999982,0.0,1385744.0
75%,1.0,0.001492,0.006461,0.999996,0.0,2358336.0
max,1.0,0.001999,0.008495,1.0,0.998817,8220183.0


End