In [146]:
import numpy as np
import pandas as pd

# Preprocessing a small custom dataset first

## Creating a pandas DataFrame for a small dataset

In [147]:
# Column-oriented toy employee records. np.nan marks the two deliberately
# missing cells (one in YearsExperience, one in Salary).
data = dict(
    EmployeeID=[1, 2, 3, 4, 5],
    YearsExperience=[2, 4, np.nan, 3, 5],
    Salary=[50000, 60000, 45000, np.nan, 70000],
    PerformanceRating=['Excellent', 'Good', 'Poor', 'Good', 'Excellent'],
)

# One DataFrame column per dictionary key, in insertion order.
df = pd.DataFrame(data)
print(df)

   EmployeeID  YearsExperience   Salary PerformanceRating
0           1              2.0  50000.0         Excellent
1           2              4.0  60000.0              Good
2           3              NaN  45000.0              Poor
3           4              3.0      NaN              Good
4           5              5.0  70000.0         Excellent


## Data Cleaning

In [148]:
# Remove every row that contains at least one missing value. inplace=True
# mutates df itself instead of returning a new frame, so the unfiltered rows
# are gone after this cell. The surviving rows keep their original index
# labels (0, 1, 4 in the recorded output).
df.dropna(inplace=True)
print(df)

   EmployeeID  YearsExperience   Salary PerformanceRating
0           1              2.0  50000.0         Excellent
1           2              4.0  60000.0              Good
4           5              5.0  70000.0         Excellent


## Transforming the data

In [149]:
# Replace YearsExperience with its square (element-wise power of two).
# The column is overwritten, so running this cell a second time squares the
# already-squared values again.
df['YearsExperience'] = df['YearsExperience'].pow(2)
print(df)

   EmployeeID  YearsExperience   Salary PerformanceRating
0           1              4.0  50000.0         Excellent
1           2             16.0  60000.0              Good
4           5             25.0  70000.0         Excellent


## Reducing the data

In [150]:
# Discard the EmployeeID column; it is not used as a feature or target in the
# cells that follow. The drop happens on df itself (inplace=True).
df.drop(columns='EmployeeID', inplace=True)
print(df)

   YearsExperience   Salary PerformanceRating
0              4.0  50000.0         Excellent
1             16.0  60000.0              Good
4             25.0  70000.0         Excellent


## Assessing data quality

In [151]:
# DataFrame.info() writes its summary (index, columns, non-null counts, dtypes,
# memory usage) straight to stdout and returns None, so it must not be wrapped
# in print() -- doing so appends a stray "None" line to the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 4
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   YearsExperience    3 non-null      float64
 1   Salary             3 non-null      float64
 2   PerformanceRating  3 non-null      object 
dtypes: float64(2), object(1)
memory usage: 96.0+ bytes
None


## Encoding categorical data

In [152]:
# Ordinal codes for the rating labels: Poor -> 0, Good -> 1, Excellent -> 2.
rating_codes = {'Poor': 0, 'Good': 1, 'Excellent': 2}

# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping a column that has already been encoded would wipe it out. Encode only
# while the column still holds the text labels; this keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['PerformanceRating']):
    df['PerformanceRating'] = df['PerformanceRating'].map(rating_codes)
print(df)

   YearsExperience   Salary  PerformanceRating
0              4.0  50000.0                  2
1             16.0  60000.0                  1
4             25.0  70000.0                  2


## Splitting the dataset

In [153]:
from sklearn.model_selection import train_test_split

# Features are the two numeric columns; the encoded rating is the target.
feature_columns = ['YearsExperience', 'Salary']
X, y = df[feature_columns], df['PerformanceRating']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Show both feature parts, separated by one blank line.
print(X_train, X_test, sep='\n\n')

   YearsExperience   Salary
4             25.0  70000.0
1             16.0  60000.0

   YearsExperience   Salary
0              4.0  50000.0


## Scaling the features

In [154]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from the training rows only,
# then apply that same transformation to both parts so the test rows never
# influence the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled, X_test_scaled, sep='\n')

[[ 1.  1.]
 [-1. -1.]]
[[-3.66666667 -3.        ]]


# Applying the same steps to the Titanic dataset

## Step 1: Acquiring the dataset

In [155]:
# Location of the Titanic passenger CSV (fetched over the network on every run).
TITANIC_CSV_URL = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
# Horizontal rule printed between the individual overviews.
section_rule = "------------------------------------------------------------------------------"

titanic_df = pd.read_csv(TITANIC_CSV_URL)

# First look at the data: leading rows, then column dtypes, then summary
# statistics for every column (include="all" also covers non-numeric ones).
for overview in (titanic_df.head(10), titanic_df.dtypes):
    print(overview)
    print(section_rule)
print(titanic_df.describe(include="all"))

   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   
5         0       3                                    Mr. James Moran   
6         0       1                             Mr. Timothy J McCarthy   
7         0       3                      Master. Gosta Leonard Palsson   
8         1       3   Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson   
9         1       2                 Mrs. Nicholas (Adele Achem) Nasser   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                

## Step 2: Cleaning the data

### Generating and handling missing values

In [156]:
# Duplicate of the imports in the notebook's first cell.
import pandas as pd
import numpy as np

# Seed NumPy's global random state before the np.random draws below.
np.random.seed(42)

# 200 synthetic passengers using the same column names as the Titanic CSV.
# NOTE: this rebinds `data`, which earlier held the toy employee dictionary.
data = {
    'Survived': np.random.choice([0, 1], size=200),
    'Pclass': np.random.choice([1, 2, 3], size=200),
    'Name': [f'Person_{i}' for i in range(1, 201)],  # Person_1 ... Person_200
    'Sex': np.random.choice(['male', 'female'], size=200),
    # Ages are drawn from 10..70 plus NaN, so some entries start out missing.
    'Age': np.random.choice([np.nan, *np.arange(10, 71)], size=200),
    'Siblings/Spouses Aboard': np.random.randint(0, 4, size=200),  # 0..3
    'Parents/Children Aboard': np.random.randint(0, 3, size=200),  # 0..2
    # Fares are drawn from 10, 20, ..., 100 plus NaN.
    'Fare': np.random.choice([np.nan, *np.arange(10, 101, 10)], size=200)
}

# NOTE: this rebinds `df`; the toy employee DataFrame from the first section is
# no longer reachable under that name after this line.
df = pd.DataFrame(data)

# Additionally blank out Age and Fare in a random 20% of the rows; a separate
# sample of rows is drawn for each of the two columns.
for column in ['Age', 'Fare']:
    df.loc[df.sample(frac=0.2).index, column] = np.nan

# Synthetic rows with their missing values still in place.
print(df)

print("---------------------------------------")

from sklearn.impute import SimpleImputer

columns_with_missing = ['Age', 'Fare']

# Fill each gap with the mean of its column.
imputer = SimpleImputer(strategy='mean')  # Alternatives: 'median', 'most_frequent', or a constant value

# The imputer is fitted on the synthetic rows only (df), before they are
# combined with the real Titanic rows.
df[columns_with_missing] = imputer.fit_transform(df[columns_with_missing])


# Append the synthetic rows to the loaded Titanic rows; ignore_index=True
# renumbers the combined frame from 0.
# NOTE(review): titanic_df is rebound to the combined frame, so re-running this
# cell without reloading the CSV appends another 200 synthetic rows.
titanic_df = pd.concat([titanic_df, df], ignore_index=True)
# Disabled line kept from the original notebook:
# titanic_df.drop('FamilySize',axis=1,inplace=True)
print(titanic_df.tail(10))

     Survived  Pclass        Name     Sex   Age  Siblings/Spouses Aboard  \
0           0       3    Person_1    male  13.0                        1   
1           1       3    Person_2  female  44.0                        3   
2           0       1    Person_3    male  42.0                        2   
3           0       1    Person_4    male   NaN                        2   
4           0       2    Person_5    male  39.0                        0   
..        ...     ...         ...     ...   ...                      ...   
195         1       2  Person_196    male  15.0                        2   
196         1       3  Person_197    male  33.0                        3   
197         1       1  Person_198  female  53.0                        2   
198         0       1  Person_199  female  12.0                        1   
199         0       1  Person_200  female  44.0                        1   

     Parents/Children Aboard  Fare  
0                          1  20.0  
1            

In [157]:
# Drop rows whose Age is missing, modifying titanic_df in place.
# In the recorded run this removes nothing: the printed frame still ends at
# index 1086 and the later info() report lists 1087 non-null Age values.
titanic_df.dropna(subset=['Age'], inplace=True)
print(titanic_df)

      Survived  Pclass                                               Name  \
0            0       3                             Mr. Owen Harris Braund   
1            1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2            1       3                              Miss. Laina Heikkinen   
3            1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4            0       3                            Mr. William Henry Allen   
...        ...     ...                                                ...   
1082         1       2                                         Person_196   
1083         1       3                                         Person_197   
1084         1       1                                         Person_198   
1085         0       1                                         Person_199   
1086         0       1                                         Person_200   

         Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard  \
0  

## Step 3: Transforming the data

In [158]:
# Derived feature: FamilySize is the row-wise sum of the two relatives-aboard
# counts (siblings/spouses plus parents/children).
sibling_spouse_count = titanic_df['Siblings/Spouses Aboard']
parent_child_count = titanic_df['Parents/Children Aboard']
titanic_df['FamilySize'] = sibling_spouse_count + parent_child_count
print(titanic_df)

      Survived  Pclass                                               Name  \
0            0       3                             Mr. Owen Harris Braund   
1            1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2            1       3                              Miss. Laina Heikkinen   
3            1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4            0       3                            Mr. William Henry Allen   
...        ...     ...                                                ...   
1082         1       2                                         Person_196   
1083         1       3                                         Person_197   
1084         1       1                                         Person_198   
1085         0       1                                         Person_199   
1086         0       1                                         Person_200   

         Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard  \
0  

## Step 4: Reducing the data

In [159]:
# Remove the free-text Name column from titanic_df itself (inplace=True); the
# remaining columns are the ones carried into the later steps.
titanic_df.drop(columns=['Name'], inplace=True)
print(titanic_df)

      Survived  Pclass     Sex   Age  Siblings/Spouses Aboard  \
0            0       3    male  22.0                        1   
1            1       1  female  38.0                        1   
2            1       3  female  26.0                        0   
3            1       1  female  35.0                        1   
4            0       3    male  35.0                        0   
...        ...     ...     ...   ...                      ...   
1082         1       2    male  15.0                        2   
1083         1       3    male  33.0                        3   
1084         1       1  female  53.0                        2   
1085         0       1  female  12.0                        1   
1086         0       1  female  44.0                        1   

      Parents/Children Aboard       Fare  FamilySize  
0                           0   7.250000           1  
1                           0  71.283300           1  
2                           0   7.925000           0  

## Step 5: Assessing data quality

In [160]:
# DataFrame.info() writes its summary (index, columns, non-null counts, dtypes,
# memory usage) straight to stdout and returns None, so it must not be wrapped
# in print() -- doing so appends a stray "None" line to the report.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 1087 non-null   int64  
 1   Pclass                   1087 non-null   int64  
 2   Sex                      1087 non-null   object 
 3   Age                      1087 non-null   float64
 4   Siblings/Spouses Aboard  1087 non-null   int64  
 5   Parents/Children Aboard  1087 non-null   int64  
 6   Fare                     1087 non-null   float64
 7   FamilySize               1087 non-null   int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 68.1+ KB
None


## Step 6: Encoding categorical data

In [161]:
# Binary codes for the Sex labels: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping a column that has already been encoded would replace all of it with
# NaN (which would then flow into the split and scaling steps). Encode only
# while the column still holds the text labels; this keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(titanic_df['Sex']):
    titanic_df['Sex'] = titanic_df['Sex'].map(sex_codes)
print(titanic_df)

      Survived  Pclass  Sex   Age  Siblings/Spouses Aboard  \
0            0       3    0  22.0                        1   
1            1       1    1  38.0                        1   
2            1       3    1  26.0                        0   
3            1       1    1  35.0                        1   
4            0       3    0  35.0                        0   
...        ...     ...  ...   ...                      ...   
1082         1       2    0  15.0                        2   
1083         1       3    0  33.0                        3   
1084         1       1    1  53.0                        2   
1085         0       1    1  12.0                        1   
1086         0       1    1  44.0                        1   

      Parents/Children Aboard       Fare  FamilySize  
0                           0   7.250000           1  
1                           0  71.283300           1  
2                           0   7.925000           0  
3                           0  53.1

In [162]:
# Step 6 (continued): one-hot encoding with scikit-learn. Only Pclass is encoded,
# purely for demonstration purposes.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to expand into indicator columns; every other column of titanic_df
# (including the Survived target) is passed through unchanged.
onehot_columns = ['Pclass']

column_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), onehot_columns)
    ],
    remainder='passthrough',
    # Always return a dense array so the result can be wrapped in a DataFrame.
    sparse_threshold=0
)

X_encoded = column_transformer.fit_transform(titanic_df)

# Build the column labels from the fitted encoder instead of hard-coding them:
# the indicator columns come first (one per learned category, in the encoder's
# order), followed by the passed-through columns in their original order.
fitted_encoder = column_transformer.named_transformers_['onehot']
onehot_names = [f'Pclass_{category}' for category in fitted_encoder.categories_[0]]
passthrough_names = [name for name in titanic_df.columns if name not in onehot_columns]

# Reuse titanic_df's index so the encoded rows stay aligned with their source rows.
X_encoded_df = pd.DataFrame(
    X_encoded,
    columns=onehot_names + passthrough_names,
    index=titanic_df.index,
)
print(X_encoded_df)

      Pclass_1  Pclass_2  Pclass_3  Survived  Sex   Age  \
0          0.0       0.0       1.0       0.0  0.0  22.0   
1          1.0       0.0       0.0       1.0  1.0  38.0   
2          0.0       0.0       1.0       1.0  1.0  26.0   
3          1.0       0.0       0.0       1.0  1.0  35.0   
4          0.0       0.0       1.0       0.0  0.0  35.0   
...        ...       ...       ...       ...  ...   ...   
1082       0.0       1.0       0.0       1.0  0.0  15.0   
1083       0.0       0.0       1.0       1.0  0.0  33.0   
1084       1.0       0.0       0.0       1.0  1.0  53.0   
1085       1.0       0.0       0.0       0.0  1.0  12.0   
1086       1.0       0.0       0.0       0.0  1.0  44.0   

      Siblings/Spouses Aboard  Parents/Children Aboard       Fare  FamilySize  
0                         1.0                      0.0   7.250000         1.0  
1                         1.0                      0.0  71.283300         1.0  
2                         0.0                      

## Step 7: Splitting the dataset

In [163]:
# Survived is the prediction target; every remaining column is a feature.
target_column = 'Survived'
y = titanic_df[target_column]
X = titanic_df.drop(columns=target_column)

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Show both feature parts, separated by one blank line.
print(X_train, X_test, sep='\n\n')

      Pclass  Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard  \
433        3    1  21.0                        2                        2   
877        3    0  33.0                        0                        0   
659        1    0  47.0                        0                        0   
2          3    1  26.0                        0                        0   
1086       1    1  44.0                        1                        2   
...      ...  ...   ...                      ...                      ...   
847        3    0  74.0                        0                        0   
715        3    0  33.0                        0                        0   
905        2    0  63.0                        3                        0   
235        2    0  44.0                        1                        0   
1061       1    1  54.0                        0                        1   

          Fare  FamilySize  
433    34.3750           4  
877     7.8958   

## Step 8: Scaling the features

In [164]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then push both parts through
# the fitted scaler so the test rows never influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

# Scaled training matrix, a blank line, then the scaled test matrix.
print(X_train_scaled, X_test_scaled, sep='\n\n')

[[ 0.89634447  1.26870391 -0.71233852 ...  1.85868186 -0.03807969
   1.81977501]
 [ 0.89634447 -0.78820597  0.09687279 ... -0.59975731 -0.58757962
  -0.71044017]
 [-1.45689826 -0.78820597  1.04095266 ... -0.59975731 -0.22043908
  -0.71044017]
 ...
 [-0.2802769  -0.78820597  2.11990108 ... -0.59975731  1.32377916
   1.18722122]
 [-0.2802769  -0.78820597  0.83864983 ... -0.59975731 -0.21187882
  -0.07788637]
 [-1.45689826  1.26870391  1.51299259 ...  0.62946227  1.32377916
  -0.07788637]]

[[-1.45689826  1.26870391  1.64786114 ...  0.62946227  0.97427793
  -0.07788637]
 [ 0.89634447 -0.78820597 -0.17286431 ... -0.59975731 -0.55428905
  -0.71044017]
 [ 0.89634447 -0.78820597  2.52450673 ... -0.59975731 -0.45052838
  -0.71044017]
 ...
 [-1.45689826  1.26870391 -0.91464135 ... -0.59975731  3.97019515
  -0.07788637]
 [ 0.89634447  1.26870391  0.90608411 ...  1.85868186  0.07865106
   2.45232881]
 [ 0.89634447 -0.78820597 -0.03799576 ... -0.59975731 -0.59008648
  -0.71044017]]
