# Title: ABC

Problem Statement (What we are predicting/clustering?)

Type of Machine Learning (Regression/Classification/Clustering)

Success Metrics (RMSE, R2, Accuracy, Precision, Recall etc.)

Constraints or Assumptions

## Environment setup

In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import *

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): the blanket 'ignore' filter also hides deprecation and
# convergence warnings - confirm that is intended before trusting results.
warnings.filterwarnings('ignore')

## Data loading

In [None]:
# Load the dataset into a DataFrame.
# NOTE(review): 'path_to_data.csv' looks like a template placeholder - point it
# at the real CSV file before running.
df = pd.read_csv('path_to_data.csv')
# Preview the first rows to sanity-check the parsed columns.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataframe.
df.shape

Based on `df.shape`, note in a markdown cell how many rows and features (columns) the dataset has, as the conclusion of this step.

## Data Understanding - EDA

Basic info

In [None]:
# Print each column's dtype and non-null count.
df.info()

Number of null values in each feature

In [None]:
# Count the missing (NaN/None) entries per column.
df.isna().sum()

Identify the numerical and categorical columns in our dataframe

In [None]:
# Split the column names by dtype: int64/float64 columns count as numerical,
# object-dtype columns as categorical.
# NOTE(review): eda_plots() in this notebook also treats the 'category' dtype
# as categorical, while this cell selects only 'object' - confirm which is
# intended.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Show both groups of column names.
print(f'Numerical Columns: {numerical_columns}')
print(f'Categorical Columns: {categorical_columns}')

Summary statistics of the numerical features

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

### Visuals

Distribution of our target variable

In [None]:
# Histogram (30 bins) with a KDE overlay for the target column, drawn on an
# explicitly created 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['target_variable'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Target Variable')
plt.show()

Now let's visualise our categorical and numerical columns

In [None]:
def eda_plots(df):
    """
    Draw three grids of exploratory plots for a dataframe.

    Figures are produced in this order, each laid out three panels wide:
    1. Pie charts of the five most frequent values of every
       object/category column
    2. 30-bin histograms of every int64/float64 column (NaNs dropped)
    3. Boxplots of every int64/float64 column (NaNs dropped)

    A grid is skipped entirely when the dataframe has no column of the
    matching kind.

    Parameters:
    df (pd.DataFrame): Input dataframe
    """
    panels_per_row = 3

    def start_grid(panel_count, panel_height):
        # Open a figure tall enough for `panel_count` panels; return row count.
        row_count = math.ceil(panel_count / panels_per_row)
        plt.figure(figsize=(5 * panels_per_row, panel_height * row_count))
        return row_count

    def finish_grid(heading):
        # Add the figure-level title, tidy the spacing and render the figure.
        plt.suptitle(heading, fontsize=16)
        plt.tight_layout()
        plt.show()

    cat_names = df.select_dtypes(include=['object', 'category']).columns
    num_names = df.select_dtypes(include=['int64', 'float64']).columns

    # --- Pie charts: one panel per categorical column ---
    if len(cat_names) > 0:
        row_count = start_grid(len(cat_names), 5)
        for slot, name in enumerate(cat_names, start=1):
            plt.subplot(row_count, panels_per_row, slot)
            # Only the five most frequent values are drawn, so the printed
            # percentages are relative to those five, not the whole column.
            top_counts = df[name].value_counts().head(5)
            plt.pie(
                top_counts,
                labels=top_counts.index,
                autopct='%1.1f%%',
                startangle=140,
            )
            plt.ylabel('')
            plt.title(f"distribution of {name}")
        finish_grid('Categorical Columns - Pie Charts')

    # Both remaining grids need numerical columns; stop here if there are none.
    if len(num_names) == 0:
        return

    # --- Histograms: one panel per numerical column ---
    row_count = start_grid(len(num_names), 4)
    for slot, name in enumerate(num_names, start=1):
        plt.subplot(row_count, panels_per_row, slot)
        plt.hist(df[name].dropna(), bins=30)
        plt.title(f'Histogram of {name}')
        plt.xlabel(name)
        plt.ylabel('Frequency')
    finish_grid('Numerical Columns - Histograms')

    # --- Boxplots: one panel per numerical column ---
    row_count = start_grid(len(num_names), 4)
    for slot, name in enumerate(num_names, start=1):
        plt.subplot(row_count, panels_per_row, slot)
        plt.boxplot(df[name].dropna())
        plt.title(f'Boxplot of {name}')
        plt.ylabel(name)
    finish_grid('Numerical Columns - Boxplots')

# Render the pie-chart, histogram and boxplot grids for the loaded dataframe.
eda_plots(df)

Pairplot of our dataframe to inspect the pairwise relationships between features

In [None]:
# Grid of pairwise plots across the dataframe's columns.
# NOTE(review): likely slow on wide datasets - consider passing a column subset.
sns.pairplot(df)

## Data preprocessing

First, make a copy of our dataframe so that the original dataframe stays unchanged

In [None]:
# Deep copy: changes made to df_copy do not propagate back to df.
df_copy = df.copy(deep=True)

Now we have to remove the null values from our data; missing values are very common in real-world datasets