# Load Dataset

In [1]:
import pandas as pd
import numpy as np
#-----------------------------
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# NOTE(review): pd.read_csv() is called without a file path or buffer, so this
# cell raises TypeError as written - TODO pass the dataset location as the
# first argument.
df = pd.read_csv()

# Data Cleaning

In [None]:
# Format column names
def format_column_names(df):
    """
    Rename the columns of a DataFrame in place: spaces become underscores and
    letters are lower-cased.

    Only string labels are rewritten. Non-string labels (for example integer
    column labels) are kept unchanged instead of raising ``AttributeError``.

    Parameters:
        df (pandas.DataFrame): DataFrame whose column names are to be formatted.
            Its ``columns`` attribute is reassigned, so the caller's object changes.

    Returns:
        pandas.DataFrame: The same DataFrame object, with the formatted column names.
    """
    df.columns = [
        col.replace(' ', '_').lower() if isinstance(col, str) else col
        for col in df.columns
    ]
    return df

In [None]:
# Show the distinct values found in each column of df.
df.apply(lambda column: column.unique())

In [1]:
# Remove outliers
def remove_outliers(df, column_names=None):
    """
    Drop rows that are outliers according to the interquartile range (IQR) rule.

    Columns are processed one after another, and the quartiles of each column are
    computed on the rows that survived the previous columns. A row is kept when its
    value lies inside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. Rows whose value is NaN fail
    both comparisons and are therefore dropped as well. One summary line is printed
    per processed column.

    Parameters:
    - df: DataFrame
        The DataFrame containing the data. It is not modified; a filtered
        DataFrame is returned instead.
    - column_names: list or None, default None
        The list of column names for which outliers are to be removed,
        or None to remove outliers from all numerical columns.

    Returns:
    - df_filtered: DataFrame
        The DataFrame with outliers removed.
    """
    if column_names is None:
        numerical_columns = df.select_dtypes(include='number').columns
    else:
        numerical_columns = column_names

    # Percentages are reported relative to the size of the input, not of the
    # progressively filtered frame.
    total_rows = len(df)

    for col in numerical_columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        # Define the lower and upper bounds for outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Build the keep-mask once and reuse it for counting and filtering
        within_bounds = (df[col] >= lower_bound) & (df[col] <= upper_bound)
        removed_rows = len(df) - int(within_bounds.sum())

        # Update DataFrame
        df = df[within_bounds]

        # Report the number and percentage of removed rows for this column;
        # an empty input would otherwise divide by zero.
        percentage_removed = (removed_rows / total_rows) * 100 if total_rows else 0.0
        print(f"Removed {removed_rows} rows ({percentage_removed:.2f}%) due to outliers in column '{col}'.")

    return df

In [2]:
def scale_dataframe(df, columns_to_scale=None):
    """
    Scale the specified columns in the DataFrame using Min-Max scaling.

    The input is left untouched; scaling is applied to a copy. When there is
    nothing to scale (no numerical columns, or an empty selection) the copy is
    returned unchanged instead of handing an empty selection to the scaler.

    Parameters:
        df (pandas.DataFrame): DataFrame to be scaled.
        columns_to_scale (list): List of columns to be scaled. If None, scale all numerical columns.

    Returns:
        pandas.DataFrame: Scaled copy of the DataFrame.
    """
    df_scaled = df.copy()
    if columns_to_scale is None:
        columns_to_scale = df.select_dtypes(include=['number']).columns.tolist()
    if len(columns_to_scale) == 0:
        return df_scaled

    # Imported locally: no visible cell of this notebook imports MinMaxScaler,
    # so the bare name would raise NameError when the function is called.
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler()
    df_scaled[columns_to_scale] = scaler.fit_transform(df_scaled[columns_to_scale])
    return df_scaled

# EDA

In [None]:
# Columns plotted as categories; every other column of df is treated as numerical.
categorical_columns = ['cash_advance_trx', 'tenure', 'full_payment']
numerical_columns = [name for name in df.columns if name not in categorical_columns]

## Univariate

In [None]:
# Plot every column on one three-row grid: histograms (with a KDE curve) for the
# numerical columns, followed by count plots for the categorical columns.
num_rows = 3
columns_to_plot = list(numerical_columns) + list(categorical_columns)
num_cols = (len(columns_to_plot) + num_rows - 1) // num_rows  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when the grid has a single
# column; without it plt.subplots returns a 1-D array and 2-D indexing fails.
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 10), squeeze=False)
flat_axes = axes.ravel()  # row-major order: index i is row i // num_cols, column i % num_cols

for i, column in enumerate(numerical_columns):
    ax = flat_axes[i]
    sns.histplot(df[column], ax=ax, kde=True)
    ax.set_title(column)

# Categorical plots continue in the cells right after the numerical ones
for i, column in enumerate(categorical_columns, start=len(numerical_columns)):
    ax = flat_axes[i]
    sns.countplot(data=df, x=column, ax=ax)
    ax.set_title(column)

# Hide the grid cells that did not receive a plot
for ax in flat_axes[len(columns_to_plot):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of every numerical column, laid out on a three-row grid.
num_rows = 3
num_cols = (len(numerical_columns) + num_rows - 1) // num_rows  # ceiling division

fig, axes = plt.subplots(num_rows, num_cols, figsize=(8 * num_cols, 6 * num_rows))
axes = axes.ravel()  # flatten so each axis can be paired with one column

for ax, col in zip(axes, numerical_columns):
    df.boxplot(column=col, ax=ax)
    ax.set_title(f'Box plot for {col}')
    ax.set_ylabel('Values')

plt.tight_layout()
plt.show()

## Bivariate

In [None]:
# Pairwise plots of the numerical columns. numerical_columns is already a list,
# so it is passed to df[...] directly; wrapping it in another pair of brackets
# nests a list inside a list, which is not a valid column selection.
sns.pairplot(df[numerical_columns])
# suptitle labels the whole figure; plt.title would only label the last subplot.
plt.suptitle('Pairplot of Numerical Variables', y=1.02)
plt.show()

## Multivariate

# Model

## Kmeans Clustering

In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [7]:
# Determine the number of clusters
def calculate_inertia(df):
    """
    Fit KMeans on ``df`` for 1 to 11 clusters and plot the inertia of each fit.

    Parameters:
        df: Data passed unchanged to ``KMeans.fit``.

    Returns:
        None. The inertia curve is drawn with matplotlib and shown.
    """
    cluster_counts = range(1, 12)
    inertia_t = [KMeans(n_clusters=k).fit(df).inertia_ for k in cluster_counts]

    # Line plot of inertia against the number of clusters
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 2, 2)
    sns.lineplot(x=cluster_counts, y=inertia_t)
    plt.title('KMeans inertia on transformed data')
    plt.show()

In [6]:
def calculate_silhouette(df):
    """
    Fit KMeans on ``df`` for 2 to 11 clusters and plot the silhouette score of each fit.

    Parameters:
        df: Data passed unchanged to ``KMeans.fit`` and ``silhouette_score``.

    Returns:
        None. The silhouette curve is drawn with matplotlib and shown.
    """
    cluster_counts = range(2, 12)  # starts at 2, unlike the inertia scan which starts at 1
    silhouette_scores = []
    for k in cluster_counts:
        labels = KMeans(n_clusters=k).fit(df).labels_
        silhouette_scores.append(silhouette_score(df, labels))

    # Line plot of the silhouette score against the number of clusters
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 2, 2)
    sns.lineplot(x=cluster_counts, y=silhouette_scores)
    plt.title('KMeans silhouette scores on transformed data')
    plt.show()

## ARIMA


In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

def plot_acf_pacf(series, lags=None):
    """
    Draw the autocorrelation and partial autocorrelation plots of a series,
    each on its own 12x6 figure.

    Parameters:
        series: Data handed to ``plot_acf`` and ``plot_pacf``.
        lags: Forwarded as the ``lags`` argument of both plotting functions.

    Returns:
        None. Both figures are shown with matplotlib.
    """
    panels = (
        (plot_acf, 'Autocorrelation Function (ACF)', 'Autocorrelation'),
        (plot_pacf, 'Partial Autocorrelation Function (PACF)', 'Partial Autocorrelation'),
    )
    for draw, title, y_label in panels:
        plt.figure(figsize=(12, 6))
        draw(series, lags=lags, ax=plt.gca())
        plt.title(title)
        plt.xlabel('Lag')
        plt.ylabel(y_label)
        plt.show()

# Example usage: ACF/PACF of the 'Oil' column of df over 30 lags.
# Replace 'Oil' with the column to model and adjust `lags` as needed.
plot_acf_pacf(series=df['Oil'], lags=30)

In [None]:
def differentiate_and_plot(series, diff_order):
    """
    Difference a time series ``diff_order`` times, plot the result and run an
    Augmented Dickey-Fuller (ADF) test on it.

    Parameters:
        series (pandas.Series): Series to difference; its index is used as the x axis.
        diff_order (int): Number of times first-order differencing is applied.
            0 leaves the series unchanged (apart from dropping NaN values).

    Returns:
        None. The differenced series is plotted and the ADF statistic, p-value,
        critical values and a stationarity verdict (5% level) are printed.
    """
    # Imported locally: no visible cell of this notebook imports adfuller,
    # so the bare name would raise NameError when the function is called.
    from statsmodels.tsa.stattools import adfuller

    # Apply first-order differencing diff_order times. series.diff(diff_order)
    # would instead subtract the value diff_order steps back (a lag difference),
    # which is not the n-th order difference named in the plot labels.
    differentiated_series = series
    for _ in range(diff_order):
        differentiated_series = differentiated_series.diff()
    differentiated_series = differentiated_series.dropna()

    # Plot the differentiated series
    plt.figure(figsize=(10, 6))
    plt.plot(differentiated_series.index, differentiated_series.values, label=f'{diff_order}th Order Difference')
    plt.title(f'{diff_order}th Order Difference of Time Series')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

    # Perform ADF test to check stationarity
    result = adfuller(differentiated_series)
    print('ADF Statistic:', result[0])
    print('p-value:', result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'   {key}: {value}')
    if result[1] < 0.05:
        print("The differentiated series is likely stationary (reject the null hypothesis)")
    else:
        print("The differentiated series is likely non-stationary (fail to reject the null hypothesis)")

In [None]:
def create_arima_model(df, column_name, p, d, q, forecast_steps=24):
    """
    Fit an ARIMA(p, d, q) model to one column of ``df``, evaluate it on a
    hold-out set and forecast beyond the end of the data.

    The column is split chronologically 80/20. A model fitted on the first part
    forecasts the second part; both are plotted and the RMSE is printed. A second
    model is then fitted on the whole column and used for the forward forecast.

    Parameters:
        df (pandas.DataFrame): Data containing the series to model.
        column_name (str): Name of the column to model.
        p, d, q (int): ARIMA order.
        forecast_steps (int): Number of periods forecast past the end of the
            data. Defaults to 24 (the notebook treats these as months -
            assumes monthly data, TODO confirm).

    Returns:
        None. Two figures are shown and the RMSE is printed.
    """
    # Imported locally: no visible cell of this notebook imports ARIMA,
    # so the bare name would raise NameError when the function is called.
    from statsmodels.tsa.arima.model import ARIMA

    series = df[column_name]

    # Chronological 80/20 split; iloc makes the positional slicing explicit
    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size], series.iloc[train_size:]

    # Fit on the training part and forecast exactly the test horizon
    fitted_model = ARIMA(train, order=(p, d, q)).fit()
    predictions = fitted_model.forecast(steps=len(test))

    # Visualize the training, testing, and predictions
    plt.figure(figsize=(10, 6))
    plt.plot(train.index, train.values, label='Training Data')
    plt.plot(test.index, test.values, label='Actual Test Data')
    plt.plot(test.index, predictions, color='red', label='Predicted Test Data')
    plt.title('ARIMA Model Predictions vs Actual')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

    # Refit on the modelled column only (not the whole DataFrame, which may hold
    # other columns) and forecast exactly forecast_steps periods ahead.
    final_model = ARIMA(series, order=(p, d, q)).fit()
    prediction = final_model.forecast(steps=forecast_steps)

    # Draw history and forecast on one explicit axis
    fig, ax = plt.subplots(figsize=(10, 6))
    series.plot(ax=ax, legend=True, label='Train')
    prediction.plot(ax=ax, legend=True, label='prediction')
    plt.show()

    # RMSE on the hold-out set; arrays are compared positionally so that
    # differing index labels cannot misalign the two sequences.
    errors = np.asarray(test, dtype=float) - np.asarray(predictions, dtype=float)
    rmse = np.sqrt(np.mean(errors ** 2))
    print("Root Mean Squared Error (RMSE):", rmse)

## AR

In [None]:

def differentiate_series(series, diff_order):
    """
    Return the ``diff_order``-th order difference of a series.

    First-order differencing is applied ``diff_order`` times and the NaN values
    this creates at the start are dropped. ``series.diff(diff_order)`` is not
    used because it subtracts the value ``diff_order`` steps back (a lag
    difference) rather than differencing repeatedly.

    Parameters:
        series (pandas.Series): Series to difference.
        diff_order (int): Non-negative number of differencing passes;
            0 returns the series with NaN values dropped.

    Returns:
        pandas.Series: The differenced series.

    Raises:
        ValueError: If ``diff_order`` is negative.
    """
    if diff_order < 0:
        raise ValueError("diff_order must be non-negative")
    differentiated_series = series
    for _ in range(diff_order):
        differentiated_series = differentiated_series.diff()
    return differentiated_series.dropna()

def create_ar_model(df, column_name, p, diff_order=0, forecast_steps=24):
    """
    Fit an autoregressive model to one (optionally differenced) column of ``df``,
    evaluate it on a hold-out set and forecast beyond the end of the data.

    The column is differenced ``diff_order`` times, split chronologically 80/20,
    and an AR(p) model fitted on the first part predicts the second part. A second
    model fitted on the whole differenced series produces the forward forecast,
    which is integrated back to the scale of the original column before plotting.
    The RMSE on the hold-out set and an ADF test on the differenced series are printed.

    Parameters:
        df (pandas.DataFrame): Data containing the series to model.
        column_name (str): Name of the column to model.
        p (int): Number of autoregressive lags.
        diff_order (int): Number of first-order differencing passes applied
            before modelling. 0 (default) models the column as is.
        forecast_steps (int): Number of periods forecast past the end of the
            data. Defaults to 24 (the notebook treats these as months -
            assumes monthly data, TODO confirm).

    Returns:
        None. Three figures are shown and the metrics are printed.
    """
    # Imported locally: no visible cell of this notebook imports these names,
    # so the bare names would raise NameError when the function is called.
    from statsmodels.tsa.ar_model import AutoReg
    from statsmodels.tsa.stattools import adfuller

    series = df[column_name]

    # Plot the original series before differentiation
    plt.figure(figsize=(10, 6))
    plt.plot(series.index, series.values, label='Original Data')
    plt.title('Original Data')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

    # levels[k] is the series differenced k times. Every level is kept because
    # its last value is needed to integrate the forecast back below. Differencing
    # is done here, pass by pass, so the integration is its exact inverse.
    levels = [series]
    for _ in range(diff_order):
        levels.append(levels[-1].diff().dropna())
    differentiated_series = levels[-1]

    # Chronological 80/20 split; iloc makes the positional slicing explicit
    train_size = int(len(differentiated_series) * 0.8)
    train = differentiated_series.iloc[:train_size]
    test = differentiated_series.iloc[train_size:]

    # Fit AR model
    model_ar = AutoReg(train, lags=p)
    fitted_model_ar = model_ar.fit()

    # Out-of-sample predictions covering exactly the test horizon
    predictions_ar_diff = fitted_model_ar.predict(start=len(train), end=len(train)+len(test)-1)

    # Visualize the training, testing, and predictions for differentiated series
    plt.figure(figsize=(10, 6))
    plt.plot(train.index, train.values, label='Training Data')
    plt.plot(test.index, test.values, label='Actual Test Data')
    plt.plot(test.index, predictions_ar_diff, color='blue', label='Predicted Test Data (AR)')
    plt.title('AR Model Predictions vs Actual (Differentiated Series)')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

    # Refit on the full differenced series and forecast forecast_steps periods.
    # The forecast starts at the first position AFTER the sample, so no in-sample
    # fitted value is mixed into (and later integrated with) the forecast.
    final_model_ar_diff = AutoReg(differentiated_series, lags=p).fit()
    n_obs = len(differentiated_series)
    prediction_ar_diff = final_model_ar_diff.predict(start=n_obs, end=n_obs + forecast_steps - 1)

    # Undo the differencing one level at a time: a cumulative sum anchored on the
    # last observed value of the next lower level. With diff_order == 0 there is
    # nothing to undo and the forecast is used as is.
    prediction_ar = prediction_ar_diff
    for level in reversed(levels[:-1]):
        prediction_ar = level.iloc[-1] + prediction_ar.cumsum()

    # Plotting AR prediction on the original series
    plt.figure(figsize=(10, 6))
    plt.plot(series.index, series.values, label='Original Data')
    plt.plot(prediction_ar.index, prediction_ar.values, color='purple', label='Forecast (AR)')
    plt.title(f'AR Model Forecast for the Next {forecast_steps} Months (Original Series)')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

    # RMSE on the hold-out set; arrays are compared positionally so that
    # differing index labels cannot misalign the two sequences.
    errors = np.asarray(test, dtype=float) - np.asarray(predictions_ar_diff, dtype=float)
    rmse = np.sqrt(np.mean(errors ** 2))
    print("Root Mean Squared Error (RMSE) for differentiated series:", rmse)

    # Perform ADF test to check stationarity for differentiated series
    result = adfuller(differentiated_series)
    print('ADF Statistic for differentiated series:', result[0])
    print('p-value for differentiated series:', result[1])
    print('Critical Values for differentiated series:')
    for key, value in result[4].items():
        print(f'   {key}: {value}')
    if result[1] < 0.05:
        print("The differentiated series is likely stationary (reject the null hypothesis)")
    else:
        print("The differentiated series is likely non-stationary (fail to reject the null hypothesis)")
