## Exploring the data

In [None]:
import pandas as pd
import numpy as np

#### Importing the data

In [None]:
# Load the raw dataset from a path relative to the current working directory.
data = pd.read_csv("Main Data/datacopy.csv")

#### Exploring the data

In [None]:
# Report how many rows were loaded (first entry of the frame's shape).
print("total number of rows : ", data.shape[0])

In [None]:
data.info()

In [None]:
# Keep every row but only the first 48 columns.
data = data.iloc[:, :48]

In [None]:
data.info()

In [None]:
# One-column table listing the column names, indexed by column position.
all_cols = pd.DataFrame(data=pd.Series(data=data.columns))
all_cols

#### Selecting Columns 

In [None]:
# Names of the columns kept for the analysis, looked up by their position in `all_cols`.
s_cols = all_cols.loc[[
    4, 7, 10, 11, 14, 16, 19, 20, 21, 22, 24,
    27, 28, 30, 31, 33, 34, 35, 38, 39, 46, 47,
]]
s_cols  # Selected Columns

#### Reducing the data to the selected columns

In [None]:
# Selected Data
# Take the chosen columns by position. The explicit .copy() makes `s_data` an
# independent frame, so the in-place clean-up steps that follow neither raise
# SettingWithCopyWarning nor depend on whether pandas returned a view of `data`.
s_data = data.iloc[
    :,
    [4, 7, 10, 11, 14, 16, 19, 20, 21, 22, 24, 27, 28, 30, 31, 33, 34, 35, 38, 39, 46, 47],
].copy()

In [None]:
s_data.head()

In [None]:
s_data.columns

#### Exploring the reduced data

In [None]:
s_data.info()

## Handling Missing Values

In [None]:
s_data.isnull().sum() # number of missing values in each column

#### Dropping those rows with all values missing

In [None]:
# Actually remove the rows in which every selected column is missing.
# dropna() returns a new frame, so the result has to be assigned back;
# previously it was only used to display the resulting shape.
s_data = s_data.dropna(how='all')
s_data.shape

#### Handling Age

In [None]:
s_data["Age"].isnull().sum() # Number of missing values in age

In [None]:
s_data[s_data["Age"].isnull()]

In [None]:
# Drop the rows with Age = NaN. The result is assigned back instead of using
# inplace=True, because `s_data` is derived from `data` and mutating it in
# place makes pandas emit SettingWithCopyWarning.
s_data = s_data.dropna(axis=0, subset=['Age'])

In [None]:
s_data.isnull().sum() # Number of missing values remaining in each column

#### Handling Gender

In [None]:
s_data['Gender'].isnull().sum()

In [None]:
s_data[s_data['Gender'].isnull()].isnull().sum(axis = 0)

In [None]:
# Most frequent Gender value, used below to fill the missing entries.
mode = s_data['Gender'].mode()[0] # mode() returns a Series; take its first entry

In [None]:
# Fill the missing Gender values with the mode (the original author noted
# that the mode is 1.0, i.e. male).
# Assign the result back to the column: calling replace(..., inplace=True) on
# `s_data['Gender']` mutates a temporary Series, which does not update
# `s_data` under pandas Copy-on-Write and triggers a warning on pandas 2.2.
s_data['Gender'] = s_data['Gender'].replace(np.nan, mode)

In [None]:
s_data.isnull().sum()

#### Handling MRS

In [None]:
s_data['MRS'].isnull().sum() # number of rows missing values in MRS

In [None]:
s_data[s_data['MRS'].isnull()]

In [None]:
s_data[s_data['MRS'].isnull()].isnull().sum()

In [None]:
s_data['MRS'].value_counts(dropna = False)

In [None]:
# Most frequent MRS value; mode() returns a Series, so take its first entry.
mode = s_data['MRS'].mode()[0]

In [None]:
# Fill the missing MRS values with the mode computed above.
# The result is assigned back to the column because an in-place replace on
# `s_data['MRS']` acts on a temporary Series and is not guaranteed to reach
# `s_data` (it never does under pandas Copy-on-Write).
s_data['MRS'] = s_data['MRS'].replace(np.nan, mode)

In [None]:
s_data.isnull().sum()

#### Handling CODE SMOKER 1 yes - 2 NO

In [None]:
s_data['CODE SMOKER 1 yes - 2 NO'].isnull().sum()

In [None]:
s_data[s_data['CODE SMOKER 1 yes - 2 NO'].isnull()]

In [None]:
s_data[s_data['CODE SMOKER 1 yes - 2 NO'].isnull()].isnull().sum()

In [None]:
# Most frequent smoker code; mode() returns a Series, so take its first entry.
mode = s_data['CODE SMOKER 1 yes - 2 NO'].mode()[0]

In [None]:
# Fill the missing smoker codes with the mode computed above.
# The result is assigned back to the column because an in-place replace on the
# selected column acts on a temporary Series and is not guaranteed to reach
# `s_data` (it never does under pandas Copy-on-Write).
s_data['CODE SMOKER 1 yes - 2 NO'] = s_data['CODE SMOKER 1 yes - 2 NO'].replace(np.nan, mode)

In [None]:
s_data.isnull().sum()

#### Resetting the indices after dropping the rows

In [None]:
# Renumber the rows 0..n-1 after the row drops above, discarding the old index.
# Later cells look rows up by these labels. Reassigning (rather than
# inplace=True) follows the pandas recommendation to avoid in-place mutation.
s_data = s_data.reset_index(drop=True)

## Changing Datatypes

In [None]:
s_data.info()

In [None]:
s_cols.reset_index()

#### Changing the datatype of Age

In [None]:
# Store Age as 64-bit integers.
s_data['Age'] = s_data['Age'].astype(np.int64)

In [None]:
s_data['Age'].dtype

In [None]:
s_data['Age']

#### Gender

In [None]:
s_data["Gender"].dtype

In [None]:
# Store Gender as 64-bit integers.
s_data['Gender'] = s_data['Gender'].astype(np.int64)

In [None]:
# Keep an independent snapshot of the frame at this point. A plain assignment
# would only create a second name for the same object, so every later in-place
# change to `s_data` would also change the "backup".
s_data_backup1 = s_data.copy()

#### Changing the datatype of the remaining columns

In [None]:
# Convert the columns at positions 2..8 to int64.
# astype() with a column -> dtype mapping returns a frame with the new dtypes.
# Assigning through `.iloc[:, 2:9] = ...` writes the values into the existing
# columns in place on pandas >= 2.0, which leaves their dtype unchanged.
s_data = s_data.astype({column: 'int64' for column in s_data.columns[2:9]})

In [None]:
s_data.info()

In [None]:
# Convert the columns at positions 10..11 to int64.
# A column -> dtype mapping is used because `.iloc[:, 10:12] = ...` sets the
# values in place on pandas >= 2.0 and keeps the old dtype.
s_data = s_data.astype({column: 'int64' for column in s_data.columns[10:12]})

In [None]:
# Convert the columns at positions 13..15 to int64.
# A column -> dtype mapping is used because `.iloc[:, 13:16] = ...` sets the
# values in place on pandas >= 2.0 and keeps the old dtype.
s_data = s_data.astype({column: 'int64' for column in s_data.columns[13:16]})

In [None]:
# Convert the column at position 18 to int64.
# A column -> dtype mapping is used because `.iloc[:, 18] = ...` sets the
# values in place on pandas >= 2.0 and keeps the old dtype.
s_data = s_data.astype({s_data.columns[18]: 'int64'})

In [None]:
# Convert the columns at positions 20..21 to int64.
# A column -> dtype mapping is used because `.iloc[:, 20:22] = ...` sets the
# values in place on pandas >= 2.0 and keeps the old dtype.
s_data = s_data.astype({column: 'int64' for column in s_data.columns[20:22]})

## Dividing the Blood Pressure column into systolic blood pressure and diastolic blood pressure

In [None]:
# Blood pressure readings; the next cell splits each value on '/'.
ser = s_data['Blood Pressure at the time of admission']

In [None]:
# Split every reading on '/' into its two integer parts, in row order.
systolic = []
diastolic = []

# Iterate over the values themselves instead of `ser[i]` with range(len(ser)):
# `ser[i]` is a label lookup and only works while the index is exactly
# 0..n-1, whereas value iteration is independent of the index.
for reading in ser:
    # Unpacking into exactly two names still raises ValueError when a reading
    # does not contain exactly one '/'.
    systolic_part, diastolic_part = reading.split('/')
    systolic.append(int(systolic_part))
    diastolic.append(int(diastolic_part))

#### Inserting the columns Systolic BP and Diastolic BP

In [None]:
s_data['Systolic BP'] = systolic

In [None]:
s_data['Diastolic BP'] = diastolic

In [None]:
s_data.info()

#### Removing the column Blood Pressure at the time of admission

In [None]:
# The combined reading is no longer needed once it has been split; remove it in place.
s_data.drop(columns=["Blood Pressure at the time of admission"], inplace=True)

## Cleaned data

In [None]:
s_data.info()

#### Changing the order of columns

In [None]:
s_data.columns.tolist()

In [None]:
# Desired column order for the cleaned data. The names are reproduced
# verbatim, including their irregular spacing.
l = [
    'Age',
    'Gender',
    'Door to needle time (min )',
    'Door-door time',
    'CODE of Neurological Deficit ',
    'CODING WINDOW ',
    'MRS',
    'Hyper tension 1-yes              2- No',
    'DM',
    'Blood Sugar at the time of admission ( mg/dl)',
    'Systolic BP',
    'Diastolic BP',
    'CODE SMOKER 1 yes - 2 NO',
    'History of Stroke-1 yes - 2 No',
    'CODING ANT. POST. COMBINATION',
    'LARGE Vs SMALL ARTERY',
    ' Dose of Actilyse/ Weight(mg)',
    'HAGE',
    'NIHSS on admission.1',
    'NIHSS after 1 hr',
    '  NIHSS after 6 hrs',
    'coding of Complications',
    'Outcome',
]

In [None]:
# Rearrange the columns into the order given by `l`.
s_data = s_data.loc[:, l]

In [None]:
s_data.info()

#### Storing the cleaned data

In [None]:
# Write the cleaned frame to an Excel workbook at a path relative to the working directory.
# NOTE(review): to_excel writes the row index as the first column by default — confirm that is wanted.
s_data.to_excel("cleanData.xlsx")