### Import Relevant Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Loading Dataset

In [None]:
train_df = pd.read_csv("netflix_titles.csv")

### Basic Understanding of Data

In [None]:
train_df.shape

In [None]:
train_df.head()

In [None]:
train_df.columns

In [None]:
train_df.describe()

In [None]:
train_df.describe(include='object')

In [None]:
train_df.info()

### Exploratory Data Analysis (EDA) and Feature Engineering

In [None]:
data = train_df.copy()

In [None]:
data.head() 

In [None]:
from dateutil import parser
data['date_added'] = data['date_added'].apply(lambda x: parser.parse(x, fuzzy=True) if pd.notnull(x) else None)

In [None]:
# This code uses the parser.parse function from the dateutil library with the fuzzy=True parameter,
# which allows it to automatically detect and parse various date formats. The apply function is used 
# to apply this parsing function to each element in the 'date_added' column.

In [None]:
data.dtypes

In [None]:
data['data_added_day'] = data['date_added'].dt.day

In [None]:
data['data_added_month'] = data['date_added'].dt.month

In [None]:
data['data_added_year'] = data['date_added'].dt.year

In [None]:
data.head()

In [None]:
data.drop(['date_added'], axis=1, inplace=True)

In [None]:
data.head()

In [None]:
data.dtypes

In [None]:
# Visualizing Target Feature 'Type'

In [None]:
plt.figure(figsize=(10, 6)) 
plt.pie(data['type'].value_counts(), labels=data['type'].value_counts().keys(), explode=[0.1,0.0],autopct='%1.1f%%', textprops={'fontsize': 20, 'fontweight': 'bold'}, colors=['g', 'm'], shadow=True) 
plt.title('Type Feature Distribution')
plt.legend(loc = 1)
plt.show()

In [None]:
data.head()

In [None]:
#  How many unique show IDs are there?

In [None]:
unique_show_ids = data['show_id'].nunique()
print(f"There are {unique_show_ids} unique show IDs.")

# Plot a count plot for the unique show IDs
show_id_counts = data['show_id'].value_counts()
plt.figure(figsize=(10, 6))
sns.countplot(x=show_id_counts.values, color='purple')
plt.xlabel('Number of Appearances')
plt.ylabel('Number of Unique IDs')
plt.title('Distribution of Unique Show IDs')
plt.show()

In [None]:
 # What are the top 10 most common titles in the dataset?

In [None]:
top_titles = data['title'].value_counts().head(10)
print("Top 10 most common titles:")
print(top_titles)

# Plot a bar plot for the top 10 most common titles
plt.figure(figsize=(12, 6))
sns.barplot(x=top_titles.index, y=top_titles.values, palette='Set2')
plt.xlabel('Title')
plt.ylabel('Number of Appearances')
plt.title('Top 10 Most Common Titles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Who are the top 10 directors with the most content?

In [None]:
# A title may credit several comma-separated directors; split the credit string
# so each director is counted individually instead of counting whole strings.
top_directors = (
    data['director'].dropna().str.split(',').explode().str.strip()
    .loc[lambda names: names != ''].value_counts().head(10)
)
print("Top 10 directors with the most content:")
print(top_directors)

# Plot a bar plot for the top 10 directors
plt.figure(figsize=(12, 6))
sns.barplot(x=top_directors.index, y=top_directors.values, palette='Set1')
plt.xlabel('Director Name')
plt.ylabel('Number of Shows')
plt.title('Top 10 Directors Distribution')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Who are the top 10 most common cast members?

In [None]:
# 'cast' holds a comma-separated list of people per title; split and explode it
# so the ranking is over individual cast members rather than whole cast lists.
top_cast = (
    data['cast'].dropna().str.split(',').explode().str.strip()
    .loc[lambda names: names != ''].value_counts().head(10)
)
print("Top 10 most common cast members:")
print(top_cast)

# Plot a bar plot for the top 10 most common cast members
plt.figure(figsize=(12, 6))
sns.barplot(x=top_cast.index, y=top_cast.values, palette='muted')
plt.xlabel('Cast Members')
plt.ylabel('Number of Appearances')
plt.title('Top 10 Most Common Cast Members')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
#  Which countries are most represented in the dataset?

In [None]:
# Co-productions list several comma-separated countries; count each country on
# its own so multi-country titles contribute to every country they name.
top_countries = (
    data['country'].dropna().str.split(',').explode().str.strip()
    .loc[lambda names: names != ''].value_counts().head(10)
)
print("Top 10 countries represented in the dataset:")
print(top_countries)

# Plot a bar plot for the most represented countries
plt.figure(figsize=(12, 6))
sns.barplot(x=top_countries.index, y=top_countries.values,  palette='viridis')
plt.xlabel('Countries')
plt.ylabel('Number of Contents')
plt.title('Top 10 Countries Represented in the Dataset')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# What is the distribution of content-added dates by year?

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(x='data_added_year', data=data, order=data['data_added_year'].value_counts().index)
plt.title('Distribution of Content Added Dates')
plt.xticks(rotation=45, ha='right')
plt.show() 

In [None]:
# What is the distribution of content-added dates by month?

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(x='data_added_month', data=data, order=data['data_added_month'].value_counts().index)
plt.title('Distribution of Content Added Dates')
plt.xticks(rotation=45, ha='right')
plt.show() 

In [None]:
# What is the distribution of release years for content?

In [None]:
plt.figure(figsize=(12, 6))
plt.hist(data['release_year'], bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Release Years')
plt.xlabel('Release Year')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
 # What is the distribution of content ratings?

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='rating', data=data, order=data['rating'].value_counts().index)
plt.title('Distribution of Content Ratings')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# What is the distribution of content durations for movies and TV shows?

In [None]:
subset_data = data.head(100)

# Create a histogram plot for the distribution of content durations
plt.figure(figsize=(10, 6))
sns.histplot(x='duration', data=subset_data,multiple='stack', hue='type', bins=20, palette='viridis')
plt.title('Distribution of Content Durations (First 100 Rows)')
plt.xticks(rotation=90, ha='right')
plt.show() 

In [None]:
# What are the top 10 most common content categories?

In [None]:
top_categories = data['listed_in'].value_counts().head(10)
print("Top 10 most common content categories:")
print(top_categories)

plt.figure(figsize=(12, 6))
sns.barplot(x=top_categories.values, y=top_categories.index, palette='Set2')
plt.xlabel('Number of Appearances')
plt.ylabel('Content Categories')
plt.title('Top 10 Most Common Content Categories')
plt.tight_layout()
plt.show()

In [None]:
# What are the word frequencies in content descriptions?
# Most common words in Description 

In [None]:
from wordcloud import WordCloud

text = ' '.join(data['description'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='orange').generate(text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Content Descriptions')
plt.show() 

In [None]:
# Plot the distribution of genres

In [None]:
# Drop missing values, split the comma-separated 'listed_in' column, and explode it so each genre is counted on its own
genre_counts = data['listed_in'].dropna().apply(lambda x: x.split(', ')).explode().value_counts()

plt.figure(figsize=(12, 8))
sns.barplot(x=genre_counts.index, y=genre_counts.values, palette='Set1')
plt.xlabel('Genres')
plt.ylabel('Count')
plt.title('Distribution of Genres')
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
%%html

<div style="background-color: purple; padding: 20px; text-align: center;">
    <h2>Top 10 Highest-Rated Shows/Movies</h2>
</div>


In [None]:
# Top 10 Highest-Rated Shows/Movies

In [None]:
# Take the ten rows that come first when 'rating' is sorted in descending order
# and draw one horizontal bar per title, using the 'rating' value as bar length.
# NOTE(review): elsewhere in this notebook 'rating' is handled as a categorical
# column (count plot, object-dtype cardinality check, target-guided encoding),
# so this ordering is presumably alphabetical over rating labels rather than a
# numeric score -- confirm that "highest-rated" is really what this chart shows.
sorted_df = data.sort_values(by='rating', ascending=False).head(10)

plt.figure(figsize=(12, 6))
plt.barh(sorted_df['title'], sorted_df['rating'], color = 'purple')
plt.xlabel('Rating')
plt.ylabel('Show/Movie Title')
plt.title('Top 10 Highest-Rated Shows/Movies')
# Flip the y axis so the first row of sorted_df is drawn at the top.
plt.gca().invert_yaxis() 
plt.tight_layout()
plt.show()

### Data Pre-processing or Cleaning

In [None]:
# Checking Duplicates Data

In [None]:
data.duplicated().sum()

In [None]:
"""
Observation
💡 We can observe that we don't have any duplicate rows in our training dataset.
💡 So we don't have data leakage from duplicated rows in our dataset.
"""

In [None]:
# Checking Cardinality of Categorical features

In [None]:
data.select_dtypes(include='object').nunique()

In [None]:
"""
Observation
💡 We normally drop features with high cardinality, but in this project we will do feature
engineering and create new features from these features.
💡 Because more data generally leads to better predictions by the model.
"""

### Let's deal with missing values

In [None]:
data.isnull().sum() 

In [None]:
# Checking Total Number & Percentage of Missing Values in Training Dataset

In [None]:
df = (data.isnull().sum()[data.isnull().sum() > 0]).to_frame().rename(columns={0:'Number of Missing Values'})
df['% of Missing Values'] = round(100 * data.isnull().sum()[data.isnull().sum() > 0] / len(data), 2)

In [None]:
"""
data.isnull().sum(): This counts the number of missing (NaN) values in each column of 
data.

[data.isnull().sum() > 0]: This filters the result to only include columns with more than 0 
missing values.

.to_frame(): This converts the filtered result into a DataFrame.

.rename(columns={0:'Number of Missing Values'}): This renames the column labelled 0 to "Number of 
Missing Values".
"""

In [None]:
df

In [None]:
# Visualizing Missing Numbers

In [None]:
import missingno as msno

In [None]:
msno.bar(data, color='C1', fontsize=22)
plt.show()

In [None]:
# Another way to visualize missing Values

In [None]:
plt.figure(figsize=(14, 8))
sns.heatmap(data.isnull(), cmap='summer')
plt.show()

In [None]:
data.dtypes

In [None]:
# Handling Missing Values.

In [None]:
cat_cols = [col for col in data.columns if data[col].dtype=="object"]
num_cols = [col for col in data.columns if data[col].dtype!="object"] 

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
# Using Simple Imputer Library to Fill Missing Values

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer1 = SimpleImputer(strategy="most_frequent")   ##To fill Categorical Features.
imputer2 = SimpleImputer(strategy="median")            ##To fill numeircal features.

In [None]:
def fill_missing_no(df):
    """Impute the missing values of ``df`` in place.

    Columns named in the module-level ``cat_cols`` list are filled by
    ``imputer1`` (most-frequent strategy) and columns named in ``num_cols``
    by ``imputer2`` (median strategy).  Both imputers are re-fitted on ``df``
    each time this function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column listed in ``cat_cols`` and
        ``num_cols``.  It is modified in place.

    Returns
    -------
    None
    """
    # An empty column group would ask the imputer to fit on a zero-column
    # selection, which it cannot do, so such a group is simply skipped.
    if cat_cols:
        df[cat_cols] = imputer1.fit_transform(df[cat_cols])
    if num_cols:
        df[num_cols] = imputer2.fit_transform(df[num_cols])

In [None]:
fill_missing_no(data)

In [None]:
# Missing values left in data

In [None]:
data.isnull().sum()

### Feature Encoding

In [None]:
# For Nominal data (no inherent order)  ==> One-hot encoding
# For Ordinal data (ordered categories) ==> Label encoding

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
data.dtypes

In [None]:
data['show_id'].nunique() # Label encoding

In [None]:
data['type'].unique()

In [None]:
data['type'].nunique()

In [None]:
data['type'].value_counts()

In [None]:
data['title'].unique()

In [None]:
data['title'].nunique() # Target guided encoding

In [None]:
data['director'].unique()

In [None]:
data['director'].nunique() # Target guided encoding

In [None]:
data['cast'].unique()

In [None]:
data['cast'].nunique() # Target guided encoding

In [None]:
data['country'].unique()

In [None]:
data['country'].nunique() # Target guided encoding

In [None]:
data['rating'].unique()

In [None]:
data['rating'].nunique() # Target guided encoding

In [None]:
data['duration'].unique()

In [None]:
data['duration'].nunique() # Target guided encoding

In [None]:
data['listed_in'].unique()

In [None]:
data['listed_in'].nunique() # Target guided encoding

In [None]:
data['description'].unique()

In [None]:
data['description'].nunique() # Target guided encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
for col in ['show_id', 'type']:  # Label Encoding
    data[col] = le.fit_transform(data[col])

In [None]:
data.head(3)

In [None]:
data.dtypes

In [None]:
def target_guided_encoding(i_col, t_col, df=None): # Target guided encoding
    """Replace each category of ``i_col`` with its rank by mean target value.

    The frame is grouped by ``i_col``, the mean of ``t_col`` is taken per
    category, and the categories are numbered 0, 1, 2, ... in ascending order
    of that mean.  The column is then overwritten in place with those ranks.

    Parameters
    ----------
    i_col : str
        Name of the categorical column to encode.
    t_col : str
        Name of the numeric target column whose per-category mean drives the
        ordering.
    df : pandas.DataFrame, optional
        Frame to encode.  Defaults to the module-level ``data`` frame, which
        keeps existing two-argument calls working unchanged.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    NOTE(review): the ranks are computed from the target over every row of the
    frame.  When this runs before the train/test split, the encoded feature
    carries target information from rows that later land in the test set, and
    for near-unique columns the rank practically encodes the target itself.
    Consider deriving the mapping from training rows only.
    """
    frame = data if df is None else df
    # mergesort is stable, so categories with equal means keep the key order
    # produced by groupby and the resulting codes are reproducible.
    ordered = frame.groupby(i_col)[t_col].mean().sort_values(kind='mergesort').index
    ranks = {category: rank for rank, category in enumerate(ordered)}
    frame[i_col] = frame[i_col].map(ranks)

In [None]:
target_guided_encoding('title', 'type')

In [None]:
data['title'].nunique()

In [None]:
target_guided_encoding('director', 'type')

In [None]:
target_guided_encoding('cast', 'type')

In [None]:
target_guided_encoding('country', 'type')

In [None]:
data['country']

In [None]:
target_guided_encoding('rating', 'type')

In [None]:
target_guided_encoding('duration', 'type')

In [None]:
target_guided_encoding('listed_in', 'type')

In [None]:
target_guided_encoding('description', 'type')

In [None]:
data.head()

In [None]:
# data_a = pd.get_dummies(data['type'], prefix='type')

In [None]:
# data_a

### Let's perform outlier detection!

In [None]:
def plot(df, col):
    """Show a three-panel univariate summary of ``df[col]``.

    The panels, stacked vertically, are: a density histogram with a KDE
    overlay, a box plot with whiskers at 1.5 x IQR, and a 30-bin count
    histogram.  The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    col : str
        Name of a numeric column of ``df``.

    Returns
    -------
    None
    """
    values = df[col]
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 10))

    # histplot(kde=True, stat='density') is the supported replacement for the
    # deprecated sns.distplot: a density-normalised histogram plus KDE curve.
    sns.histplot(values, ax=ax1, kde=True, stat='density', color='b')
    ax1.set_title(f'Distribution of {col}')

    # Pass the series as x= explicitly; a bare positional argument is read
    # differently by different seaborn versions.
    sns.boxplot(x=values, ax=ax2, whis=1.5, color='g')
    ax2.set_title(f'Boxplot of {col}')

    # Plain count histogram without a KDE overlay.
    sns.histplot(values, ax=ax3, kde=False, bins=30, color='y')
    ax3.set_title(f'Histogram of {col}')

    # When every value is strictly positive, switch the y axes of the two
    # histogram panels (density / count) to a log scale so that sparsely
    # populated bins stay visible.  The data axis itself is left linear.
    if (values > 0).all():
        ax1.set_yscale('log')
        ax3.set_yscale('log')

    plt.tight_layout()
    plt.show()

# Example usage:
# plot(your_dataframe, 'your_column')

In [None]:
plot(data, 'type')

In [None]:
q1 = data['type'].quantile(0.25)
q3 = data['type'].quantile(0.75)

iqr = q3- q1

maximum = q3 + 1.5*iqr
minimum = q1 - 1.5*iqr

In [None]:
maximum

In [None]:
minimum

In [None]:
[x for x in data['type'] if x> maximum or x<minimum] 

In [None]:
len([x for x in data['type'] if x> maximum or x<minimum]) # so, no outlier

### Let's perform feature selection

In [None]:
X = data.drop(['type'] , axis=1)

In [None]:
y = data['type']

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
# The target `y` is a class label (label-encoded 'type'), so mutual information
# has to be estimated with the classification variant; the regression variant
# treats the target as a continuous quantity.  A fixed seed keeps the estimate
# reproducible between runs.
from sklearn.feature_selection import mutual_info_classif
imp = mutual_info_classif(X, y, random_state=42)

In [None]:
imp

In [None]:
imp_df = pd.DataFrame(imp , index=X.columns)

In [None]:
imp_df

In [None]:
imp_df.columns = ['importance']

In [None]:
imp_df

In [None]:
imp_df.sort_values(by='importance' , ascending=False)

In [None]:
# Drop the low-importance date parts and rebuild X from the reduced frame.
# X was derived from `data` before this drop, so without the rebuild the
# models below would still be trained on the dropped columns.
data.drop(['data_added_day', 'data_added_month'], axis = 1, inplace = True)
X = data.drop(['type'], axis=1)

In [None]:
data.dtypes

In [None]:
data['type'].unique()

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_scaled

### Let's build the ML model

### split dataset into train & test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Splitting data for models that don't need scaled data.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
X_train

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
# Splitting data for models that need scaled data.

In [None]:
# Split the raw features first, then fit the scaler on the training rows only.
# Fitting it on all rows before splitting lets test-set means and variances
# leak into the features the models are trained on.  The same test_size and
# random_state are kept, so the rows in each split are unchanged.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, random_state=42)
scaler = StandardScaler().fit(X_train1)
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)

In [None]:
X_train1

In [None]:
X_train1.shape

In [None]:
y_train1.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
ml_model = RandomForestRegressor()

In [None]:
ml_model.fit(X_train , y_train)

In [None]:
y_pred = ml_model.predict(X_test)

In [None]:
y_pred

In [None]:
from sklearn import metrics

In [None]:
metrics.r2_score(y_test , y_pred)

### How to automate ml pipeline & How to define your Evaluation metric..

### how to make our own metric...

In [None]:
def mape(y_true , y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    The percentage error is undefined where the true value is zero, so those
    positions are left out of the average instead of turning the whole result
    into ``inf``/``nan``.

    Parameters
    ----------
    y_true : array-like of numbers
        Ground-truth values.
    y_pred : array-like of numbers
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the positions where
        ``y_true`` is non-zero, or ``nan`` when every true value is zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

In [None]:
mape(y_test, y_pred)

### How to automate ml pipeline !

### Model Building For Scaled Data

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

In [None]:
evaluation_results = [] 

In [None]:
def evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` and report its train/test scores and test-set diagnostics.

    The model is trained on ``(X_train, y_train)``.  Its ``score`` on both
    splits is printed and appended, under ``model_name``, to the module-level
    ``evaluation_results`` list.  Test-set predictions are then used to print
    R2, MAE, MSE, RMSE and MAPE, plot the residual distribution, print
    accuracy / precision / recall / F1 (each called with its default
    arguments), and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``.
    model_name : str
        Label stored with the scores in ``evaluation_results``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.

    Returns
    -------
    None
    """
    # Training the model
    model.fit(X_train, y_train)

    # Scores on both splits, printed and recorded for the comparison table
    training_score = model.score(X_train, y_train)
    print(f'Training score : {training_score}')
    testing_score = model.score(X_test, y_test)
    print(f'Testing score : {testing_score}')

    evaluation_results.append({
        "Model": model_name,
        "Training Score": training_score,
        "Testing Score": testing_score,
    })

    # Flatten the predictions so a column-vector result cannot broadcast
    # against the 1-D y_test when residuals are computed below.
    y_prediction = np.ravel(model.predict(X_test))

    print(f'Predictions are : {y_prediction}')
    print('\n')

    # Error metrics; MSE is computed once and reused for RMSE
    mse = metrics.mean_squared_error(y_test, y_prediction)
    print(f'R2 score : {metrics.r2_score(y_test, y_prediction)}')
    print(f'MAE : {metrics.mean_absolute_error(y_test, y_prediction)}')
    print(f'MSE : {mse}')
    print(f'RMSE : {np.sqrt(mse)}')
    print(f'MAPE : {mape(y_test, y_prediction)}')

    # Residual distribution.  histplot(kde=True, stat='density') replaces the
    # deprecated sns.distplot.  No fixed axis limits are applied: the former
    # xlim of (-0.4, 0.4) cut off every residual of -1 or +1, i.e. every
    # misclassification when the labels are 0/1.
    plt.figure(figsize=(12, 6))
    sns.histplot(y_test - y_prediction, kde=True, stat='density', color='orange',
                 edgecolor="black", linewidth=1)
    plt.title('Residuals Distribution', fontsize=16, color='blue')
    plt.xlabel('Residuals', fontsize=12, color='green')
    plt.ylabel('Density', fontsize=12, color='purple')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

    # Classification metrics
    print("\n------------------------------------------------------------------------")
    print(f"Accuracy Score on Testing Data is: {accuracy_score(y_test, y_prediction) * 100:.2f}%")
    print(f"Precision Score is: {precision_score(y_test, y_prediction)}")
    print(f"Recall Score is: {recall_score(y_test, y_prediction)}")
    print(f"F1 Score is: {f1_score(y_test, y_prediction)}")

    # Confusion matrix heatmap
    print("\n------------------------------------------------------------------------")
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_prediction)
    plt.figure(figsize=(8, 4))
    sns.heatmap(cm, annot=True, fmt="g", cmap="viridis", linewidths=.5, cbar_kws={"shrink": 0.8})
    plt.show()

# Example usage:
# evaluate_model(your_model, 'Your Model', X_train1, y_train1, X_test1, y_test1)


In [None]:
# 1. Logistic-Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
evaluate_model(LogisticRegression(),'Logistic Regression', X_train1, y_train1, X_test1, y_test1)

In [None]:
# 2. KNeighborsClassifier Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
evaluate_model(KNeighborsClassifier(), 'KNN', X_train1, y_train1, X_test1, y_test1)

In [None]:
# 3. Support-Vector-Classifier Model

In [None]:
from sklearn.svm import SVC

In [None]:
evaluate_model(SVC(), 'SVM', X_train1, y_train1, X_test1, y_test1)

In [None]:
# 4. Naive-Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
evaluate_model(GaussianNB(), 'Naive Bayes', X_train1, y_train1, X_test1, y_test1)

### Model Building For Un-Scaled Data

In [None]:
def evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` and report its train/test scores and test-set diagnostics.

    The model is trained on ``(X_train, y_train)``.  Its ``score`` on both
    splits is printed and appended, under ``model_name``, to the module-level
    ``evaluation_results`` list.  Test-set predictions are then used to print
    R2, MAE, MSE, RMSE and MAPE, plot the residual distribution, print
    accuracy / precision / recall / F1 (each called with its default
    arguments), and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``.
    model_name : str
        Label stored with the scores in ``evaluation_results``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.

    Returns
    -------
    None
    """
    # Training the model
    model.fit(X_train, y_train)

    # Scores on both splits, printed and recorded for the comparison table
    training_score = model.score(X_train, y_train)
    print(f'Training score : {training_score}')
    testing_score = model.score(X_test, y_test)
    print(f'Testing score : {testing_score}')

    evaluation_results.append({
        "Model": model_name,
        "Training Score": training_score,
        "Testing Score": testing_score,
    })

    # Flatten the predictions so a column-vector result cannot broadcast
    # against the 1-D y_test when residuals are computed below.
    y_prediction = np.ravel(model.predict(X_test))

    print(f'Predictions are : {y_prediction}')
    print('\n')

    # Error metrics; MSE is computed once and reused for RMSE
    mse = metrics.mean_squared_error(y_test, y_prediction)
    print(f'R2 score : {metrics.r2_score(y_test, y_prediction)}')
    print(f'MAE : {metrics.mean_absolute_error(y_test, y_prediction)}')
    print(f'MSE : {mse}')
    print(f'RMSE : {np.sqrt(mse)}')
    print(f'MAPE : {mape(y_test, y_prediction)}')

    # Residual distribution.  histplot(kde=True, stat='density') replaces the
    # deprecated sns.distplot.  No fixed axis limits are applied: the former
    # xlim of (-0.4, 0.4) cut off every residual of -1 or +1, i.e. every
    # misclassification when the labels are 0/1.
    plt.figure(figsize=(12, 6))
    sns.histplot(y_test - y_prediction, kde=True, stat='density', color='orange',
                 edgecolor="black", linewidth=1)
    plt.title('Residuals Distribution', fontsize=16, color='blue')
    plt.xlabel('Residuals', fontsize=12, color='green')
    plt.ylabel('Density', fontsize=12, color='purple')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

    # Classification metrics
    print("\n------------------------------------------------------------------------")
    print(f"Accuracy Score on Testing Data is: {accuracy_score(y_test, y_prediction) * 100:.2f}%")
    print(f"Precision Score is: {precision_score(y_test, y_prediction)}")
    print(f"Recall Score is: {recall_score(y_test, y_prediction)}")
    print(f"F1 Score is: {f1_score(y_test, y_prediction)}")

    # Confusion matrix heatmap
    print("\n------------------------------------------------------------------------")
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_prediction)
    plt.figure(figsize=(8, 4))
    sns.heatmap(cm, annot=True, fmt="g", cmap="viridis", linewidths=.5, cbar_kws={"shrink": 0.8})
    plt.show()

# Example usage:
# evaluate_model(your_model, 'Your Model', X_train, y_train, X_test, y_test)


In [None]:
# 5. Decision-Tree-Classifier Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
evaluate_model(DecisionTreeClassifier(), 'Descision Tree', X_train, y_train, X_test, y_test)

In [None]:
# 6. Random-Forest-Classifier Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
evaluate_model(RandomForestClassifier(), 'Random Forest', X_train, y_train, X_test, y_test)

In [None]:
# 7. Ada-Boost-Classifier Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
evaluate_model(AdaBoostClassifier(), 'Ada Boost', X_train, y_train, X_test, y_test)

In [None]:
# 8. Gradient-Boosting-Classifier Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
evaluate_model(GradientBoostingClassifier(), 'Gradient Boost', X_train, y_train, X_test, y_test)

In [None]:
# 9. LGBM Classifier Model

In [None]:
from lightgbm import LGBMClassifier

In [None]:
evaluate_model(LGBMClassifier(),'LGBM', X_train, y_train, X_test, y_test)

In [None]:
# 10. XGBClassifier Model

In [None]:
from xgboost import XGBClassifier

In [None]:
evaluate_model(XGBClassifier(),'XG Boost', X_train, y_train, X_test, y_test)

In [None]:
# 11. Cat-Boost-Classifier Model

In [None]:
from catboost import CatBoostClassifier

In [None]:
evaluate_model(CatBoostClassifier(),'Cat Boost', X_train, y_train, X_test, y_test)

### All Model Performance Comparison

In [None]:
df_results = pd.DataFrame(evaluation_results)

In [None]:
df_results

In [None]:
df_results.plot(x="Model",y=["Training Score","Testing Score"], figsize=(16,6),kind="bar",
        title="Performance Visualization of Different Models",colormap="Set3")
plt.show()

### Hyper-Parameter Tuning of LGBM Model

In [None]:
model1 = LGBMClassifier()

In [None]:
# The seed is pinned instead of searched: it is not a model hyperparameter, and
# searching over it multiplies the grid while rewarding fold-to-fold noise.
# It stays in the grid as a single value so best_params_ still carries it.
parameters1 = {"n_estimators": [100, 300, 500, 600, 650],
               "learning_rate": [0.01, 0.02, 0.03],
               "random_state": [42],
               "num_leaves": [16, 17, 18]}

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# When search space is small & computational resources are enough ==> Grid Search
# When search space is large & computational resources are limited ==> Random Search

In [None]:
grid_search1 = GridSearchCV(model1, parameters1, cv=5, n_jobs=-1)

In [None]:
grid_search1.fit(X_train,y_train.values.ravel())

In [None]:
grid_search1.best_score_

In [None]:
best_parameters1 = grid_search1.best_params_
best_parameters1

In [None]:
# Creating LGBM Model Using Best Parameters.

In [None]:
model1 = LGBMClassifier(**best_parameters1)

In [None]:
model1.fit(X_train,y_train)

In [None]:
X_test_pred1 = model1.predict(X_test)

In [None]:
accuracy_score(y_test,X_test_pred1)

### Hyper-Parameter Tuning of CatBoost Model

In [None]:
model2 = CatBoostClassifier(verbose=False)

In [None]:
# The seed is pinned instead of searched: it is not a model hyperparameter, and
# searching over it multiplies the grid while rewarding fold-to-fold noise.
# It stays in the grid as a single value so best_params_ still carries it.
parameters2 = {"learning_rate": [0.1, 0.3, 0.5, 0.6, 0.7],
               "random_state": [42],
               "depth": [8, 9, 10],
               "iterations": [35, 40, 50]}

In [None]:
grid_search2 = GridSearchCV(model2, parameters2, cv=5, n_jobs=-1)

In [None]:
grid_search2.fit(X_train,y_train)

In [None]:
grid_search2.best_score_

In [None]:
best_parameters2 = grid_search2.best_params_
best_parameters2

In [None]:
# Creating Cat Boost Model Using Best Parameters

In [None]:
model2 = CatBoostClassifier(**best_parameters2,verbose=False)

In [None]:
model2.fit(X_train,y_train)

In [None]:
X_test_pred2 = model2.predict(X_test)

In [None]:
accuracy_score(y_test,X_test_pred2)

### Hyper-Parameter Tuning of XGBoost Model

In [None]:
model3 = XGBClassifier()

In [None]:
# The seed is pinned instead of searched: it is not a model hyperparameter, and
# searching over it multiplies the grid while rewarding fold-to-fold noise.
# It stays in the grid as a single value so best_params_ still carries it.
parameters3 = {"n_estimators": [50, 100, 150],
               "random_state": [42],
               "learning_rate": [0.1, 0.3, 0.5, 1.0]}

In [None]:
grid_search3 = GridSearchCV(model3, parameters3 , cv=5, n_jobs=-1)

In [None]:
grid_search3.fit(X_train,y_train)

In [None]:
grid_search3.best_score_

In [None]:
best_parameters3 = grid_search3.best_params_
best_parameters3

In [None]:
# Creating XGBoost Model Using Best Parameters

In [None]:
model3 = XGBClassifier(**best_parameters3)

In [None]:
model3.fit(X_train,y_train)

In [None]:
X_test_pred3 = model3.predict(X_test)

In [None]:
accuracy_score(y_test,X_test_pred3) 

### Hyper-Parameter Tuning of RandomForest Model

In [None]:
model4 = RandomForestClassifier()

In [None]:
parameters4 = {'n_estimators': [100,300,500,550],
               'min_samples_split':[7,8,9],
               'max_depth': [10,11,12], 
               'min_samples_leaf':[4,5,6]}

In [None]:
grid_search4 = GridSearchCV(model4, parameters4, cv=5, n_jobs=-1)

In [None]:
grid_search4.fit(X_train,y_train.values.ravel())

In [None]:
grid_search4.best_score_

In [None]:
best_parameters4 = grid_search4.best_params_
best_parameters4

In [None]:
# Creating Random Forest Model Using Best Parameters

In [None]:
model4 = RandomForestClassifier(**best_parameters4)

In [None]:
model4.fit(X_train,y_train)

In [None]:
X_test_pred4 = model4.predict(X_test)

In [None]:
accuracy_score(y_test,X_test_pred4)

### Stacking Classifier Model

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
stacking_model = StackingClassifier(estimators=[('LGBM', model1), 
                                                ('CAT Boost', model2),
                                                ("XGBoost", model3),
                                                ('RF', model4)])

In [None]:
stacking_model.fit(X_train, y_train)

In [None]:
X_train_pred5 = stacking_model.predict(X_train)

In [None]:
X_test_pred5 = stacking_model.predict(X_test)

In [None]:
print(f"Stacking Model accuracy on Training Data is: {accuracy_score(y_train,X_train_pred5)*100:.2f}%")

In [None]:
print(f"Stacking Model accuracy on Testing Data is: {accuracy_score(y_test,X_test_pred5)*100:.2f}%")