# Load Data and Find NA

In [None]:
import time
import pandas as pd

# Load the CSV into `df`, report how long the load took, then summarise
# missing cells, duplicated rows and per-column cardinality.
print("Loading data file now, this could take a while depending on file size")
# perf_counter() is a monotonic high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
df = pd.read_csv('') # ADD-CSV  (placeholder: put the CSV path in the quotes)
end = time.perf_counter()
print(f"Loading took {round(end - start, 2)} seconds")

# Scan for missing cells once and reuse the per-column counts for the breakdown.
missing_by_column = df.isna().sum()
missing_values = missing_by_column.sum()
duplicated_values = df.duplicated().sum()  # rows flagged by df.duplicated()
print(f'\nMissing values: {missing_values}')
print(f'Duplicated values: {duplicated_values}')
if missing_values > 0:
    print('\nMissing values by column:')
    print(missing_by_column)
print("\nUnique Values in Each Column:")
print(df.nunique())

## แยกข้อมูล ตัวเลขและStr

In [None]:
# Partition df's column names by dtype: int64/float64 columns are treated as
# numerical, object-dtype columns as categorical.
numerical_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)
non_numerical_columns = list(df.select_dtypes(include=['object']).columns)

# Show both name lists.
print(f"\nNumerical Columns: {numerical_columns}")
print(f"Categorical Columns: {non_numerical_columns}")

### หาการกระจายตัวของข้อมูล
เส้นกลางกล่อง: ค่ามัธยฐาน (Median)

กรอบกล่อง (IQR): แสดงช่วงระหว่างควอไทล์ที่ 1 (25%) และควอไทล์ที่ 3 (75%)

เส้นขีด: แสดงค่าข้อมูลที่อยู่ในช่วงไม่เกิน 1.5 เท่าของ IQR

จุด (Outliers): ค่าที่อยู่ไกลเกินกว่า 1.5 เท่าของ IQR

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
def plot_histograms(df, numerical_columns):
    """Show one histogram with a KDE overlay for each listed column.

    Args:
        df: Table indexed by column name; ``df[name]`` is what gets plotted.
        numerical_columns: Iterable of column names. Each one is drawn in its
            own 8x5 figure and displayed immediately.
    """
    hist_options = {'bins': 30, 'kde': True, 'color': '#ff8c00'}
    for col in numerical_columns:
        plt.figure(figsize=(8, 5))
        sns.histplot(df[col], **hist_options)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.show()


# numerical_columns = ['TransactionAmount', 'CustomerAge', 'TransactionDuration', 'LoginAttempts', 'AccountBalance']
# plot_histograms(df, numerical_columns)

## หาความสัมพันธ์ของข้อมูลตัวเลข

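ดูเฉพาะสามเหลี่ยมล่างซ้ายก็พอ เพราะเมทริกซ์สมมาตรกัน ค่าใกล้ 1 หมายถึงมีความสัมพันธ์เชิงบวกสูง ค่าใกล้ -1 หมายถึงมีความสัมพันธ์เชิงลบสูง (แปรผกผันกัน) ส่วนค่าใกล้ 0 หมายถึงแทบไม่มีความสัมพันธ์เชิงเส้นต่อกัน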

In [None]:
# Heatmap of the pairwise correlations between df's int64/float64 columns.
numerical_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)
correlation_matrix = df[numerical_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,       # print each coefficient inside its cell
    fmt=".2f",        # ...rounded to two decimals
    cmap='coolwarm',
)
plt.title('Correlation Matrix for Numerical Columns')
plt.show()

## ดูเนื้อหาประเภทหมวดหมู่

In [None]:
# Number of distinct values per column, printed under a header line.
print("\nUnique Values in Each Column:", df.nunique(), sep="\n")

In [None]:
def plot_categorical_distributions(columns, data=None, palette='muted'):
    """Draw one pie chart per column showing each category's share of the rows.

    Args:
        columns: Column names to plot. Each gets its own subplot; all subplots
            sit in a single row whose width grows with the number of columns.
        data: DataFrame to read from. When omitted, the notebook-level ``df``
            is looked up at call time, so a re-loaded ``df`` is picked up
            without re-running this cell.
        palette: Seaborn palette name used for the slice colours.

    Returns:
        None. Nothing is drawn when ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        return
    if data is None:
        data = df

    n_charts = len(columns)
    # 6 inches per chart keeps the original 18x6 canvas for three columns while
    # letting the grid hold any number of columns instead of exactly three.
    plt.figure(figsize=(6 * n_charts, 6))

    for i, column_name in enumerate(columns):
        plt.subplot(1, n_charts, i + 1)
        value_counts = data[column_name].value_counts()
        # One explode offset per slice. len() counts the categories; nunique()
        # would count distinct *frequencies* and come up short whenever two
        # categories occur equally often.
        value_counts.plot.pie(autopct='%1.1f%%', colors=sns.color_palette(palette),
                              startangle=90, explode=[0.05] * len(value_counts))

        plt.title(f'Percentage Distribution of {column_name}')
        plt.ylabel('')  # drop the default y-label pandas adds to pie charts

    plt.tight_layout()
    plt.show()


# columns_to_plot = ['TransactionType', 'Channel', 'CustomerOccupation']
# plot_categorical_distributions(columns_to_plot)

### ดูความถี่จำพวกหมวดหมู่

In [None]:
def plot_categorical_relationships(column_name='column', data=None, target_column='', title_prefix='Relationship'):
    """Plot how a numeric column varies across the categories of another column.

    Draws three panels side by side: a bar chart of the per-category mean, a
    box plot and a violin plot. All three panels list the categories in the
    same order (descending mean) so they can be compared position by position.

    Args:
        column_name: Name of the grouping (categorical) column.
        data: DataFrame to read from. When omitted, the notebook-level ``df``
            is looked up at call time, so a re-loaded ``df`` is picked up
            without re-running this cell.
        target_column: Name of the column whose values are summarised.
        title_prefix: Text placed at the start of every panel title.

    Raises:
        KeyError: If ``column_name`` or ``target_column`` is not a column of
            ``data``. Checked up front so no empty figure is left behind.
    """
    if data is None:
        data = df
    for required in (column_name, target_column):
        if required not in data.columns:
            raise KeyError(f"Column '{required}' not found in DataFrame")

    avg_data = data.groupby(column_name)[target_column].mean().sort_values(ascending=False)
    # Shared by all three panels; without it each plot picks its own order.
    category_order = list(avg_data.index)

    plt.figure(figsize=(16, 5))

    # Bar plot of the per-category mean
    plt.subplot(1, 3, 1)
    sns.barplot(x=avg_data.index, y=avg_data.values, order=category_order, palette='muted')
    plt.title(f'{title_prefix} - Average {target_column} by {column_name}')
    plt.xlabel(column_name)
    plt.ylabel(f'Average {target_column}')
    plt.xticks(rotation=45)

    # Box plot
    plt.subplot(1, 3, 2)
    sns.boxplot(x=column_name, y=target_column, data=data, order=category_order, palette='muted')
    plt.title(f'{title_prefix} - {target_column} Distribution by {column_name}')
    plt.xlabel(column_name)
    plt.ylabel(target_column)
    plt.xticks(rotation=45)

    # Violin plot
    plt.subplot(1, 3, 3)
    sns.violinplot(x=column_name, y=target_column, data=data, order=category_order, palette='muted')
    plt.title(f'{title_prefix} - {target_column} Violin Plot by {column_name}')
    plt.xlabel(column_name)
    plt.ylabel(target_column)
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

# Example run that relies on the function's default DataFrame and title prefix.
# NOTE(review): every call passes the same column as both the grouping column
# and the target, so each plot summarises a column against itself - confirm
# whether a separate categorical column was intended here.
target_categorical_variables = ['alcohol', 'malic_acid', 'ash']
for column in target_categorical_variables:
    plot_categorical_relationships(column, target_column=column)


## ดู 10 อันดับที่มากที่สุด

In [None]:
def plot_top_categories_pie_charts(columns, data=None, palette='muted'):
    """Draw one pie chart per column for its ten most frequent categories.

    Only the top categories are passed to the pie, so the percentages shown
    are shares within those categories, not of every row in ``data``.

    Args:
        columns: Column names to plot; each gets one subplot in a single row.
        data: DataFrame to read from. When omitted, the notebook-level ``df``
            is looked up at call time, so a re-loaded ``df`` is picked up
            without re-running this cell.
        palette: Seaborn palette name used for the slice colours.

    Returns:
        None. Nothing is drawn when ``columns`` is empty, which avoids
        creating a zero-width figure.
    """
    columns = list(columns)
    if not columns:
        return
    if data is None:
        data = df

    n_charts = len(columns)
    plt.figure(figsize=(n_charts * 6, 6))

    for i, column_name in enumerate(columns):
        plt.subplot(1, n_charts, i + 1)

        # Ten highest-frequency categories (fewer if the column has fewer).
        top_categories = data[column_name].value_counts().nlargest(10)

        top_categories.plot.pie(autopct='%1.1f%%', colors=sns.color_palette(palette),
                                startangle=90, explode=[0.05] * len(top_categories))

        plt.title(f'Top 10 Categories of {column_name}')
        plt.ylabel('')  # drop the default y-label pandas adds to pie charts

    plt.tight_layout()
    plt.show()


# columns_to_plot = ['Location', 'MerchantID']
# plot_top_categories_pie_charts(columns_to_plot)

## ดูความถี่ 10 อันดับที่มากที่สุด

column_name : ตัวแปรจำพวกหมวดหมู่ เช่น จังหวัด (กรุงเทพ, เชียงใหม่, ภูเก็ต) ระดับการศึกษา (ม.6, ป.ตรี, ป.โท) เพศ (Male, Female)

target_column : ตัวแปรเชิงปริมาณ ที่เราต้องการศึกษา เช่น ยอดขาย (Sales) คะแนนความพึงพอใจ (Rating) อายุ (Age) ราคา (Price)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Union
import numpy as np


def plot_categorical_relationships(
    column_name: str='',
    data: Optional[pd.DataFrame] = None,
    target_column: str='',
    top_n: int = 10,
    title_prefix: str = 'Analysis',
    figsize: tuple = (18, 5),
    palette: str = 'muted',
    rotation: int = 45,
    min_count: int = 1
) -> None:
    """Compare a numeric column across the most frequent categories of another.

    Keeps the ``top_n`` most frequent values of ``column_name`` that occur at
    least ``min_count`` times, then draws three panels for ``target_column``:
    a bar chart of the per-category mean (annotated with the sample count), a
    box plot and a violin plot. Categories appear in descending frequency in
    every panel.

    Args:
        column_name: Name of the grouping (categorical) column.
        data: DataFrame to read from. When omitted, the notebook-level ``df``
            is looked up at call time, so a re-loaded ``df`` is picked up
            without re-running this cell.
        target_column: Name of the quantitative column being summarised.
        top_n: Maximum number of categories to show.
        title_prefix: First line of every panel title.
        figsize: Size of the whole figure, passed to ``plt.figure``.
        palette: Seaborn palette name.
        rotation: Rotation of the x tick labels, in degrees.
        min_count: Minimum number of rows a category needs to be kept.

    Raises:
        ValueError: If ``data`` is not a DataFrame, a column name is missing
            from it, or ``top_n`` is smaller than 1.
    """
    if data is None:
        data = df

    # Input validation
    if not isinstance(data, pd.DataFrame):
        raise ValueError("data must be a pandas DataFrame")
    if column_name not in data.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")
    if target_column not in data.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")
    if top_n < 1:
        raise ValueError("top_n must be positive")

    # value_counts() is sorted by descending frequency, so head(top_n) keeps
    # the most frequent categories that pass the minimum-count filter.
    value_counts = data[column_name].value_counts()
    valid_categories = value_counts[value_counts >= min_count].head(top_n).index

    if len(valid_categories) == 0:
        print("No categories meet the minimum count criterion")
        return

    filtered_data = data[data[column_name].isin(valid_categories)].copy()

    # Single category order (descending frequency) shared by all three panels.
    category_order = list(valid_categories)

    plt.figure(figsize=figsize)

    # Bar plot of the per-category mean
    plt.subplot(1, 3, 1)
    # groupby() returns rows sorted by key; reindex to the plotting order so
    # that row i of the summary describes bar i.
    avg_value = (
        filtered_data.groupby(column_name)[target_column]
        .agg(['mean', 'count'])
        .reindex(category_order)
    )
    sns.barplot(x=avg_value.index, y='mean', data=avg_value, order=category_order, palette=palette)
    plt.title(f'{title_prefix}\nAverage {target_column} by {column_name}\n(Top {len(valid_categories)})')
    plt.xlabel(f'{column_name}\n(n=samples in each category)')
    plt.ylabel(f'Average {target_column}')
    plt.xticks(rotation=rotation)

    # Sample-size label at the base of each bar; 'count' is the number of
    # non-null target values in that category.
    for i, count in enumerate(avg_value['count']):
        plt.text(i, 0, f'n={count}', ha='center', va='bottom')

    # Box plot
    plt.subplot(1, 3, 2)
    sns.boxplot(x=column_name, y=target_column, data=filtered_data,
                order=category_order, palette=palette)
    plt.title(f'{title_prefix}\n{target_column} Distribution by {column_name}')
    plt.xlabel(column_name)
    plt.ylabel(target_column)
    plt.xticks(rotation=rotation)

    # Violin plot
    plt.subplot(1, 3, 3)
    sns.violinplot(x=column_name, y=target_column, data=filtered_data,
                   order=category_order, palette=palette)
    plt.title(f'{title_prefix}\n{target_column} Distribution (Violin) by {column_name}')
    plt.xlabel(column_name)
    plt.ylabel(target_column)
    plt.xticks(rotation=rotation)

    # Adjust layout and display
    plt.tight_layout()
    plt.show()

# plot_categorical_relationships('target', target_column='malic_acid')
# plot_categorical_relationships('alcohol', target_column='nonflavanoid_phenols')
