In [None]:
# Data analysis, cleaning and numerical computation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# after this cell runs is hidden for the rest of the session. Consider narrowing
# it to specific warning categories or modules so real problems stay visible.
warnings.filterwarnings('ignore')

In [None]:
import time
import pandas as pd

# --- Load the dataset and report how long the read took ---------------------
print("Loading data file now, this could take a while depending on file size")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if the system
# clock is adjusted while the file is being read.
start = time.perf_counter()
df = pd.read_csv('DATASET.csv') # ADD-CSV
end = time.perf_counter()
print("Loading took " + str(round(end - start, 2)) + " seconds")

# --- Quick data-quality summary ----------------------------------------------
# Per-column NaN counts are computed once and reused for both the grand total
# and the per-column breakdown below.
missing_by_column = df.isna().sum()
missing_values = missing_by_column.sum()
# df.duplicated() flags rows that repeat an earlier row in full, so this is a
# count of duplicated rows (not of individual cell values).
duplicated_values = df.duplicated().sum()
print(f'\nMissing values: {missing_values}')
print(f'Duplicated values: {duplicated_values}')
if missing_values >= 1:
    print('\nMissing values by column:')
    print(missing_by_column)
print("\nUnique Values in Each Column:")
print(df.nunique())

In [None]:
def plot_levels_overtime(df=None, x_column=None, y_column=None):
    """
    Plot one DataFrame column against another as a blue line with red point markers.

    Parameters:
    df (DataFrame): Data to plot. When omitted, the notebook-level variable
        ``df`` is looked up at call time, so a reloaded or reassigned ``df``
        is picked up without having to re-run this definition.
    x_column: Label of the column used for the x axis. Required.
    y_column: Label of the column used for the y axis; also used as the
        legend label and in the figure title. Required.

    Returns:
    None

    Raises:
    ValueError: If ``x_column`` or ``y_column`` is not given, or if ``df`` is
        omitted and no notebook-level ``df`` exists.
    KeyError: If a given column label is not present in the DataFrame.
    """
    if df is None:
        # A `df=df` default would be evaluated once, when the `def` runs: it
        # fails if `df` does not exist yet and keeps pointing at the old object
        # after `df` is reassigned. Looking the name up here binds it late.
        df = globals().get('df')
        if df is None:
            raise ValueError("No DataFrame was passed and no global 'df' is defined.")
    if x_column is None or y_column is None:
        raise ValueError("Both 'x_column' and 'y_column' must be provided.")

    # Resolve the columns before opening a figure so that an unknown column
    # name raises immediately instead of leaving an empty figure behind.
    x_values = df[x_column]
    y_values = df[y_column]

    plt.figure(figsize=(10, 6))
    plt.plot(
        x_values,
        y_values,
        marker='o',
        markersize=5,
        markerfacecolor='red',
        color='blue',
        label=y_column,
    )
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title(f"{y_column} Levels Over Time")
    plt.grid(True)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import random

def plot_seasonal_decompose(df, column_name, model='additive', period=None, color=None):
    """
    Perform seasonal decomposition on a time series and plot the results.

    The original series, trend, seasonal and residual components are drawn as
    four stacked panels in one figure, all in the same color.

    Parameters:
    df (DataFrame): The input DataFrame containing the time series data.
    column_name: Label of the column in the DataFrame to decompose. Any label
        type is accepted (for example integer column labels), not only strings.
    model (str): The type of decomposition ('additive' or 'multiplicative'),
        passed through to seasonal_decompose. Default is 'additive'.
    period (int): The number of observations per cycle, passed through to
        seasonal_decompose unchanged.
        NOTE(review): when this is None, statsmodels has to derive the period
        from the series' index frequency -- confirm the index carries one, or
        pass ``period`` explicitly.
    color: Matplotlib color used for every panel. If None (the default), a
        random RGB color is drawn on each call; pass a fixed color to get a
        reproducible figure.

    Returns:
    None
    """
    series = df[column_name]

    # Perform seasonal decomposition
    result = seasonal_decompose(series, model=model, period=period)

    # One color is shared by all four panels; it is random unless the caller fixed it.
    if color is None:
        color = [random.random() for _ in range(3)]

    # (legend label, data) for each panel, top to bottom. The decomposition
    # components can contain NaN entries, which are dropped before plotting.
    panels = (
        ('Original Series', series),
        ('Trend', result.trend.dropna()),
        ('Seasonal', result.seasonal.dropna()),
        ('Residuals', result.resid.dropna()),
    )

    plt.figure(figsize=(10, 10))
    # f-string rather than '+' so non-string column labels do not raise TypeError.
    plt.suptitle(f'Seasonal Decomposition of {column_name}', fontsize=16)
    for position, (label, values) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, position)
        plt.plot(values, color=color, label=label)
        plt.legend(loc='upper left')

    plt.tight_layout()
    plt.show()


In [1]:
import matplotlib.pyplot as plt

def plot_train_test_split(df, index_column, target_column, train_size=0.8, figsize=(15, 8)):
    """
    Split a target column into leading (train) and trailing (test) parts by row
    position, plot both parts on one chart, and return them.

    Parameters:
    - df: DataFrame containing the data. It is not modified.
    - index_column: name of the column to use as the index (x axis of the plot).
    - target_column: name of the target column to analyse.
    - train_size: fraction of rows used for training, between 0 and 1
      inclusive (default: 0.8, i.e. the first 80% of the rows).
    - figsize: figure size (default: (15, 8)).

    Returns:
    - (train, test): two single-column DataFrames holding ``target_column`` for
      the first ``int(len(df) * train_size)`` rows and for the remaining rows,
      each with a fresh 0-based integer index.

    Raises:
    - ValueError: if either column is missing, if ``index_column`` contains
      duplicate values, or if ``train_size`` is not within [0, 1].
    """
    # Validate the columns
    if index_column not in df.columns:
        raise ValueError(f"'{index_column}' not found in the DataFrame columns.")
    if target_column not in df.columns:
        raise ValueError(f"'{target_column}' not found in the DataFrame columns.")
    if df[index_column].duplicated().any():
        raise ValueError(f"'{index_column}' contains duplicate values. Please ensure it is unique.")

    # A negative fraction would become a negative slice bound and silently put
    # the split at the wrong end; NaN also fails this comparison and is rejected.
    if not (0 <= train_size <= 1):
        raise ValueError(f"'train_size' must be between 0 and 1, got {train_size!r}.")

    # Use the chosen column as the index (the column itself is kept)
    indexed = df.set_index(index_column, drop=False)

    # Compute the split position
    split_point = int(len(indexed) * train_size)

    # Separate the train and test parts; selecting with [[...]] yields new
    # DataFrames, so no extra copy of the whole input is needed.
    train_part = indexed.iloc[:split_point][[target_column]]
    test_part = indexed.iloc[split_point:][[target_column]]

    # Combine into one frame with a 'train' and a 'test' column. The index is
    # unique, so every row has a value in exactly one of the two columns.
    combined_data = train_part.rename(columns={target_column: 'train'}).join(
        test_part.rename(columns={target_column: 'test'}), how='outer'
    )

    # Draw the chart; labels are set on the Axes that pandas returns rather
    # than through pyplot's implicit "current axes".
    ax = combined_data.plot(y=['train', 'test'], figsize=figsize, fontsize=12)
    ax.set_xlabel(index_column, fontsize=12)
    ax.set_ylabel(target_column, fontsize=12)
    ax.set_title(f"Train-Test Split of {target_column} by {index_column}", fontsize=14)
    ax.legend(loc='best')
    ax.grid(True)
    plt.show()

    # Return the train and test data with a fresh positional index
    train = train_part.reset_index(drop=True)
    test = test_part.reset_index(drop=True)
    print('Training data shape: ', train.shape)
    print('Test data shape: ', test.shape)
    return train, test
