# Train/Test Split

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
# predict Species
iris_df = pd.read_csv('/Users/muhammadraza/Documents/GitHub/BIPM/Data Science/data/Dataset_Iris.csv')

In [4]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 150 non-null    int64  
 1   Sepal Length (cm)  150 non-null    float64
 2   Sepal Width (cm)   150 non-null    float64
 3   Petal Length (cm)  150 non-null    float64
 4   Petal Width (cm)   150 non-null    float64
 5   Species            150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [11]:
# Step 1: separate training data from prediction target
# =====================================================
# (also get rid of Id-column)
# Naming convention: training data = X; prediction target = y
X = iris_df.drop(columns = ["Species"])
y = iris_df["Species"]

In [12]:
# Step 2: Apply Train/Test Split
# ==============================
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# function will shuffle the dataset before separation (good)
# test-size: 20-30% of the data (rule of thumb)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [13]:
X_train

Unnamed: 0,Id,Sepal Length (cm),Sepal Width (cm),Petal Length (cm),Petal Width (cm)
98,99,5.1,2.5,3.0,1.1
68,69,6.2,2.2,4.5,1.5
19,20,5.1,3.8,1.5,0.3
143,144,6.8,3.2,5.9,2.3
99,100,5.7,2.8,4.1,1.3
...,...,...,...,...,...
37,38,4.9,3.1,1.5,0.1
79,80,5.7,2.6,3.5,1.0
33,34,5.5,4.2,1.4,0.2
94,95,5.6,2.7,4.2,1.3


# 1. Quick Sanity Checks for your Dataset

Possible Problems
- Duplicated rows
- Numbers are saved as strings



## 1.1 Check for duplicated values

In [15]:
# Netflix-dataset
netflix_df = pd.read_csv("/Users/muhammadraza/Documents/GitHub/BIPM/Data Science/data/Dataset_Netflix.csv")
netflix_df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,48,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,27.612,8.2
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,18.216,7.8
3,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,17.505,7.8
4,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,95.337,7.7


In [16]:
print(netflix_df.duplicated().sum())

0


In [18]:
# In case of dupliacted rows: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
netflix_df.drop_duplicates(inplace = True)

## 1.2 Check for wrong datatypes
Pandas datatypes (64 refers to the amount of memory that is used to save a value)
- int64 (= int in Python)
- float64 (= float in Python)
- object (= string in Python)
- datetime64 (= datetime)



In [19]:
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5806 entries, 0 to 5805
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5806 non-null   object 
 1   title                 5805 non-null   object 
 2   type                  5806 non-null   object 
 3   description           5788 non-null   object 
 4   release_year          5806 non-null   int64  
 5   age_certification     3196 non-null   object 
 6   runtime               5806 non-null   int64  
 7   genres                5806 non-null   object 
 8   production_countries  5806 non-null   object 
 9   seasons               2047 non-null   float64
 10  imdb_id               5362 non-null   object 
 11  imdb_score            5283 non-null   float64
 12  imdb_votes            5267 non-null   float64
 13  tmdb_popularity       5712 non-null   float64
 14  tmdb_score            5488 non-null   float64
dtypes: float64(5), int64(

In [20]:
# As an example: convert the runtime column to type string
netflix_df["runtime"] = netflix_df["runtime"].astype("object")

# Look at the dtypes of the dataset
print(netflix_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5806 entries, 0 to 5805
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5806 non-null   object 
 1   title                 5805 non-null   object 
 2   type                  5806 non-null   object 
 3   description           5788 non-null   object 
 4   release_year          5806 non-null   int64  
 5   age_certification     3196 non-null   object 
 6   runtime               5806 non-null   object 
 7   genres                5806 non-null   object 
 8   production_countries  5806 non-null   object 
 9   seasons               2047 non-null   float64
 10  imdb_id               5362 non-null   object 
 11  imdb_score            5283 non-null   float64
 12  imdb_votes            5267 non-null   float64
 13  tmdb_popularity       5712 non-null   float64
 14  tmdb_score            5488 non-null   float64
dtypes: float64(5), int64(

In [None]:
# As int is actually ok, we change it back
netflix_df["runtime"] = netflix_df["runtime"].astype(int)

# Look at the dtypes of the dataset
print(netflix_df.info())

# 2. Remove and replace missing values (columns/rows)

## 2.1 Remove Missing Values entirely (to be avoided)

In [None]:
# Example 1: Remove rows with null values from individual columns

# Count missing values in each column
print("Initial length:", len(netflix_df), "rows")
print(netflix_df.isnull().sum())

In [None]:
# Create a new df where we remove missing values from the "description"-column
netflix_df_subset = netflix_df[netflix_df["description"].notnull()]
    
print("New length:", len(netflix_df_subset), "rows")
print(netflix_df_subset.isnull().sum())

In [None]:
# Example 2: Drop entire columns or rows
#  - source: # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

# remove all rows that contain empty values
without_rows_df = netflix_df_subset.dropna(axis=0)
print("Removed rows with missing values.")
print("New length:", len(without_rows_df), "rows")
print(without_rows_df.isnull().sum())


In [None]:
# remove all columns that contain empty values
without_columns_df = netflix_df_subset.dropna(axis=1)
print("Removed columns with missing values:")
print(without_columns_df.isnull().sum())

## 2.2 Replace missing values (better strategy)

In [None]:
# Reload Netflix-dataset
netflix_df = pd.read_csv("../data/Dataset_Netflix.csv")
netflix_df.info()

### 2.2.1 Strategy: Replace missing values with the mean
For numeric values without outliers: replace missing values with the mean

In [None]:
netflix_df.head()

In [None]:
# Changes output of the transformers to Pandas
from sklearn import set_config
set_config(transform_output="pandas")

# import the imputing function
from sklearn.impute import SimpleImputer

In [None]:
# define imputer and strategy
imputer = SimpleImputer(strategy="mean", add_indicator=True)

In [None]:
# applying fit and transform separately
imputer.fit(netflix_df[['imdb_votes']])

# need to assign the output of the imputer to 2 columns
# the original columns gets overwritten and a new column contains the indicator
netflix_df[['imdb_votes', 'missingindicator_imdb_votes']] = imputer.transform(netflix_df[['imdb_votes']])

In [None]:
netflix_df[['imdb_votes', 'missingindicator_imdb_votes']]

In [None]:
# applying fit and transform together

# Reload Netflix-dataset
netflix_df = pd.read_csv("../data/Dataset_Netflix.csv")

# apply fit and transform
netflix_df[['imdb_votes', 'missingindicator_imdb_votes']] = imputer.fit_transform(netflix_df[['imdb_votes']])

In [None]:
netflix_df[['imdb_votes', 'missingindicator_imdb_votes']]

### 2.2.2 Strategy: Replace missing values with the median
For numeric values with outliers: replace missing values with the median

In [None]:
print(netflix_df.isnull().sum())

In [None]:
# Check for outliers
netflix_df.describe()

In [None]:
# Column "tmdb_popularity" has a high max compared to the mean - most likely outliers
# Let's investigate a bit deeper...
netflix_check = netflix_df.sort_values(by = "tmdb_popularity", ascending=False)

In [None]:
# tmdb_popularity seems to have outliers
netflix_check.head(10)

In [None]:
# define imputer and strategy
imputer = SimpleImputer(strategy="median", add_indicator=True)

# applying fit and transform together
netflix_df[['tmdb_popularity', 'missingindicator_tmdb_popularity']] = imputer.fit_transform(netflix_df[['tmdb_popularity']])

In [None]:
# check for null-values
print(netflix_df.isnull().sum())

### 2.2.3 Strategy: Replace missing values with the mode
For categorical values: replace missing values with the mode

In [None]:
netflix_df['age_certification'].value_counts()

In [None]:
# define imputer and strategy
imputer = SimpleImputer(strategy="most_frequent", add_indicator=True)

# applying fit and transform together
netflix_df[['age_certification', 'missingindicator_age_certification']] = imputer.fit_transform(netflix_df[['age_certification']])


In [None]:
netflix_df['age_certification'].value_counts()

### 2.2.4 Column Transformer: Applying transformations to multiple columns 

In [None]:
# reload dataset
netflix_df = pd.read_csv('../data/Dataset_Netflix.csv')

In [None]:
from sklearn.compose import ColumnTransformer

imputer_mean = SimpleImputer(strategy="mean", add_indicator=True)

ct = ColumnTransformer(
    [('imputer', imputer_mean, ['imdb_votes'])],
    remainder='passthrough'
)

ct.fit_transform(netflix_df)

### 2.2.5 Exercise

In [None]:
# a) Define train-test split for Netflix Dataset
#       - prediction-target = imdb_score


In [None]:
# b) Define transformations for selected columns with SimpleImputer and column transformer
#       - imdb_votes --> mean
#       - tmdb_popularity --> median/most frequent
#       - age_certification --> mode


In [None]:
# c) apply transformations to X_train and X_test


<a class="anchor" id="chapter_3"></a>

# 3. Stratify Sample Populations

- When to use: If class values are imbalanced (but actually, it can could be used all the time...)
- train_test_split() contains a parameter that keeps the original distribution of values for train/test splits

In [None]:
# Load IRIS-dataset
iris_df = pd.read_csv('../data/Dataset_Iris.csv')
iris_df.head()

## 3.1 Train-Test split without stratification

In [None]:
from sklearn.model_selection import train_test_split

# Create training set
# as we want to predict the species, we need to create X as a dataframe without this column
X = iris_df.drop("Species", axis=1)

# We save the species-column in a separate dataframe
y = iris_df[["Species"]]

# Perform Train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5 ,stratify=None, random_state=42)

In [None]:
print("Original Distribution:")
print(y["Species"].value_counts())
print()
print("Distribution of values in train set without stratification:")
print(y_train["Species"].value_counts())

## 3.2 Train-Test split with stratification


In [None]:
# We use stratified sampling to split the data into train split while keeping the species-distribution in train and test-set, respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=42)

In [None]:
print("Original Distribution:")
print(y["Species"].value_counts())
print()
print("Distribution of values in train set with stratification:")
print(y_train["Species"].value_counts())

<a class="anchor" id="chapter_4"></a>

# 4. Standardization for Numerical Variables

- Problem: High variance in different classes in a dataset
- **These methods are applied to continuous numerical data (not to classes/categories)**
- When to use?
  - Especially important for KNN, because the values in KNN are calculated by using a linear distance metric
  - if features have a high variance
  - if features are continuous and on a different scale (e.g. age, weight)
  - can improve performance of other models as well (e.g., decision trees)

Standardization
- Values are centered around the mean and scaled to z-values (measured in standard distributions)
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
- https://www.statisticshowto.com/probability-and-statistics/z-score/

In [None]:
# Example dataframe
X_train = pd.DataFrame({'age':[10, 100, 19, 21, 25], 
                        'income':[10000, 20000, 100000, 50000, 30000]})

X_train

In [None]:
X_train.mean()

In [None]:
X_train.std()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(X_train)

## 4.1 Application on Dataset

In [None]:
# Exercise with wine_df
wine_df = pd.read_csv("../data/Dataset_Red_Wine_Quality.csv")

X = wine_df.drop(columns="quality")
y = wine_df["quality"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Potential problem: high variance in a dataset:
print(X_train.var())

In [None]:
# Another problem: different value ranges (see x-axes)
import matplotlib.pyplot as plt
X_train.hist(figsize=(14, 10))
plt.show()


In [None]:
# Use standard Scaler with column-transformer
ss = StandardScaler()

columns_to_transform = X_train.columns

ct_standardization = ColumnTransformer(
    [('standard_scaler', ss, X_train.columns)],
    remainder='passthrough'
)

X_train = ct_standardization.fit_transform(X_train)

In [None]:
X_train.var()

In [None]:
# show distributions 
import matplotlib.pyplot as plt
X_train.hist(figsize=(14, 10))
plt.show()

## 4.2 Application of different Scalers

- Problem: we still have skewed distributions in some of the variables.
- For those, standardization did not have the desired effect
- Let's create two lists: columns that work well with StandardScaler and columns that doesn not.


In [None]:
# Use PowerTransformer for columns where StandardScaler did not work properly
from sklearn.preprocessing import PowerTransformer

wine_df = pd.read_csv("../data/Dataset_Red_Wine_Quality.csv")

X = wine_df.drop(columns="quality")
y = wine_df["quality"]

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

ss = StandardScaler()
pt = PowerTransformer()

# columns that show a normal distribution (or very similar) after StandardScaler was applied
columns_StandardScaler = ["density", "pH", 'sulphates', 'alcohol', 'fixed acidity', 'volatile acidity', 'chlorides', 'citric acid']

# for the most obvious skewed distributions, we create another list 
columns_skewed_distribution = ['free sulfur dioxide', 'total sulfur dioxide', 'residual sugar']

ct_standardization = ColumnTransformer(
    [('standard_scaler', ss, columns_StandardScaler),
    ('power_transfomer', pt, columns_skewed_distribution)],
    remainder='passthrough'
)

X_train = ct_standardization.fit_transform(X_train)

In [None]:
X_train.var()

In [None]:
# show distributions 
import matplotlib.pyplot as plt
X_train.hist(figsize=(14, 10))
plt.show()

# 5. Dummy Coding

Encoding of categorical variables

## 5.1 One-Hot encoding of categorical variables

In [None]:
X_train = pd.DataFrame({'shape':['square', 'square', 'oval', 'circle']})
X_train

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# left to right the columns are in alphabetical sequence (circle, oval, square)
ohe = OneHotEncoder(sparse_output=False)
ohe.fit_transform(X_train)

In [None]:
ohe.get_feature_names_out()

## 5.2 Ordinal encoding for categorical features

In [None]:
X_train = pd.DataFrame({'rating':['very good', 'very bad', 'very good', 'OK', 'good', 'very good','bad']})
X_train

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Define the order of categories
categories = [['very bad', 'bad', 'OK', 'good', 'very good']]
oe = OrdinalEncoder(categories=categories)
oe.fit_transform(X_train)

## 5.3 Application to dataset

In [None]:
iris_df = pd.read_csv("../data/Dataset_Iris.csv")
iris_df.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)

ct_one_hot = ColumnTransformer(
    [('OneHotEncoder', ohe, ["Species"])],
    remainder='passthrough'
)

ct_one_hot.fit_transform(iris_df)


<a class="anchor" id="chapter_6"></a>

# 6. Feature Selection
Source: https://www.analyticsvidhya.com/blog/2020/10/feature-selection-techniques-in-machine-learning/

Goal: remove unnecessary features from dataset that might create noise




## 6.1 Correlation Matrix

*   Good features correlate highly with the prediction target
*   Good features do not correlate among themselves

In [None]:
# Prediction of quality-levels based wine-features
wine_df = pd.read_csv("../data/Dataset_Red_Wine_Quality.csv")
wine_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# correlation matrix
cor = wine_df.corr()

plt.figure(figsize=(10,6))
sns.heatmap(cor, annot=True)


## 6.2 Information Gain
* Information gain calculates the reduction in entropy
* It can be used for feature selection by evaluating the Information gain of each variable in the context of the target variable.
*   High values indicate a strong predictive power
* https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html


In [None]:
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt

X = wine_df.drop("quality", axis=1)
y = wine_df["quality"]

importances = mutual_info_classif(X, y)
feature_importances = pd.Series(importances, wine_df.columns[0:len(wine_df.columns)-1])
feature_importances.plot(kind="bar", color="teal")
plt.show()


## 6.3 Automated Methods
Link: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from numpy import array 

iris_df = pd.read_csv("../data/Dataset_Iris.csv")


In [None]:
# Create training set and prediction target
X = iris_df.drop("Species", axis=1)
y = iris_df[["Species"]]

# Perform feature selection
# Set k to the number of features you want to identify
select = SelectKBest(score_func=chi2, k=3)
select.fit_transform(X,y)

# Print feature names
filter = select.get_support() 
features = array(X.columns)

print("All features:")
print(features)
 
print("Selected best 3:")
print(features[filter])


# 7. Pipelines

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# define classifier (= ML-model)
clf = DecisionTreeClassifier()

# Define dataset
wine_df = pd.read_csv("../data/Dataset_Red_Wine_Quality.csv")
X = wine_df.drop(columns="quality")
y = wine_df["quality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Add Preprocessing Steps
ss = StandardScaler()
pt = PowerTransformer()
columns_StandardScaler = ["density", "pH", 'sulphates', 'alcohol', 'fixed acidity', 'volatile acidity', 'chlorides', 'citric acid']
columns_skewed_distribution = ['free sulfur dioxide', 'total sulfur dioxide', 'residual sugar']
ct = ColumnTransformer(
    [('standard_scaler', ss, columns_StandardScaler),
    ('power_transfomer', pt, columns_skewed_distribution)],
    remainder='passthrough'
)

# create pipeline
pipe = Pipeline([
    ('preprocessor', ct),
    ('classifier', clf)]
)

In [None]:
pipe

In [None]:
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))