# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw dataset; the path is relative, so the CSV is looked up in the
# current working directory.
data = pd.read_csv("data_ak.csv")

# Inspect the structure and the summary statistics of the loaded frame.
print("Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()
print("Summary Statistics:")
print(data.describe())

# Handle missing values by imputation, then drop duplicate records.

# Split the columns by dtype: numeric columns get mean imputation,
# object-dtype columns get mode imputation.
num_cols = data.select_dtypes(include=['number']).columns
cat_cols = data.select_dtypes(include=['object']).columns

# Fill missing numeric values with the mean of their column.
data[num_cols] = data[num_cols].fillna(data[num_cols].mean())

# Fill missing categorical values with the mode of their column. mode()
# returns one row per tied mode, so row 0 is used as the representative value.
# The mode frame is empty when there are no object columns (or none of them
# holds a non-null value); there is nothing to impute in that case, and
# indexing row 0 would raise IndexError.
cat_modes = data[cat_cols].mode()
if not cat_modes.empty:
    data[cat_cols] = data[cat_cols].fillna(cat_modes.iloc[0])

# Identify and remove exact duplicate rows in place.
data.drop_duplicates(inplace=True)

# Outlier treatment with the 1.5 * IQR rule: numeric values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are blanked out and then re-imputed
# with the mean of the values that remain in the same column.
numeric = data[num_cols]
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_mask = (numeric < lower_fence) | (numeric > upper_fence)

# Keep only the in-range values; every flagged outlier becomes NaN.
data[num_cols] = numeric.where(~outlier_mask)
# Fill the resulting gaps with the per-column mean of what is left.
data[num_cols] = data[num_cols].fillna(data[num_cols].mean())

# Standardize categorical values: trim surrounding whitespace, lower-case the
# labels, then map known misspellings onto their canonical spelling.
# The step is skipped when the frame has no 'category_column'.
if 'category_column' in data.columns:
    typo_fixes = {'typo1': 'correct_value', 'typo2': 'correct_value'}
    normalized_labels = data['category_column'].str.strip().str.lower()
    data['category_column'] = normalized_labels.replace(typo_fixes)


# Univariate Analysis
# Print summary statistics for the numerical columns: describe(), followed by
# the first mode, the variance and the skewness of each column.
print("Univariate Analysis:")
numeric_view = data[num_cols]
print(numeric_view.describe())

# Each entry pairs a heading with the statistic printed underneath it. The
# statistic is computed lazily so the heading is always printed first.
extra_statistics = (
    ("\nMode of numerical columns:", lambda frame: frame.mode().iloc[0]),
    ("\nVariance:", lambda frame: frame.var()),
    ("\nSkewness:", lambda frame: frame.skew()),
)
for heading, compute in extra_statistics:
    print(heading)
    print(compute(numeric_view))
# Frequency distribution (count plot) for every categorical variable.
for cat_name in cat_cols:
    category_values = data[cat_name]
    plt.figure(figsize=(8, 4))
    sns.countplot(x=category_values)
    plt.title(f'Frequency Distribution of {cat_name}')
    # Tilt the tick labels so long category names do not overlap.
    plt.xticks(rotation=45)
    plt.show()
# Histogram (with a KDE overlay) and box plot for every numerical variable.
for col in num_cols:
    plt.figure(figsize=(8, 4))
    sns.histplot(data[col], kde=True)
    plt.title(f'Histogram of {col}')
    plt.show()

    # Open an explicit figure for the box plot too. Without it the plot relies
    # on a figure that pyplot creates implicitly, which does not get the 8x4
    # size used for the histogram and the other plots in this script.
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=data[col])
    plt.title(f'Boxplot of {col}')
    plt.show()



# Bivariate Analysis
print("Bivariate Analysis:")
# Pairwise correlation between the numerical variables, drawn as an annotated
# heat map on a diverging colour map.
correlation_matrix = data[num_cols].corr()
heatmap_options = {'annot': True, 'cmap': 'coolwarm'}
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Matrix")
plt.show()

# Scatter plot for every unordered pair of distinct numerical variables.
# Pairing each column only with the columns that follow it avoids drawing
# both (a, b) and (b, a), and skips (a, a).
for position, x_name in enumerate(num_cols):
    for y_name in num_cols[position + 1:]:
        plt.figure(figsize=(8, 4))
        sns.scatterplot(x=data[x_name], y=data[y_name])
        plt.title(f'Scatter Plot: {x_name} vs {y_name}')
        plt.show()

# Categorical vs. numerical comparisons: for every (numerical, categorical)
# column pair draw a bar plot, a violin plot and a box plot, in that order.
# Each entry is (title prefix, seaborn plotting function, extra keyword args).
# errorbar=None switches off the bar plot's error bars; it replaces the
# `ci=None` spelling, which seaborn deprecated in v0.12.
category_plots = (
    ('Bar Plot', sns.barplot, {'errorbar': None}),
    ('Violin Plot', sns.violinplot, {}),
    ('Boxplot', sns.boxplot, {}),
)
for col in num_cols:
    for cat in cat_cols:
        for plot_name, draw, extra_kwargs in category_plots:
            plt.figure(figsize=(8, 4))
            draw(x=data[cat], y=data[col], **extra_kwargs)
            plt.title(f'{plot_name} of {col} by {cat}')
            # Tilt the tick labels so long category names do not overlap.
            plt.xticks(rotation=45)
            plt.show()


# Multivariate Analysis
# Pair plot: a grid of pairwise plots over all numerical variables, with a
# KDE curve on the diagonal showing each variable's own distribution.
print("\nPair Plot")
numeric_frame = data[num_cols]
sns.pairplot(numeric_frame, diag_kind="kde")
plt.show()

# Heat map of the correlations among all numerical variables, annotated with
# the coefficient in every cell.
print("\nCorrelation Heatmap:")
plt.figure(figsize=(10, 6))
pairwise_corr = data[num_cols].corr()
sns.heatmap(pairwise_corr, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Variables")
plt.show()

# Grouped comparisons: one box plot per (categorical, numerical) column pair,
# with the boxes coloured by the same categorical variable used on the x axis.
print("\nGrouped Comparisons:")
# The legend is anchored just outside the upper-right corner of the axes.
legend_placement = {'bbox_to_anchor': (1.05, 1), 'loc': 'upper left'}
for group_name in cat_cols:
    groups = data[group_name]
    for value_name in num_cols:
        plt.figure(figsize=(10, 5))
        sns.boxplot(x=groups, y=data[value_name], hue=groups)
        plt.title(f'Box Plot of {value_name} by {group_name}')
        plt.xticks(rotation=45)
        plt.legend(title=group_name, **legend_placement)
        plt.show()