# Downloading The Data and Importing the packages

## Imports

In [1]:
# Import the kagglehub library
import kagglehub
from google.colab import userdata
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shutil

## Downloading The Data

In [2]:
# Set Kaggle environment variables
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

# File path and target directory
file_path = "playground-series-s4e12.zip"
unzip_dir = "data"

# Check if the data directory exists and create it if necessary
if not os.path.exists(unzip_dir):
    os.makedirs(unzip_dir)

# Check if the zip file exists
if os.path.exists(file_path):
    print("Zip file already exists.")
else:
  if not os.listdir(unzip_dir):  # List directory contents
    # Download the file from Kaggle
    print("Downloading zip file from Kaggle...")
    !kaggle competitions download -c playground-series-s4e12

# Check if the data folder is empty
if not os.listdir(unzip_dir):  # List directory contents
    print("Data folder is empty. Unzipping files...")
    !unzip -o "$file_path" -d "$unzip_dir"
else:
    print("Data folder is not empty. Skipping unzip operation.")

# Check if the file exists and remove it
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_path} has been removed.")
else:
    print(f"{file_path} does not exist.")

Downloading zip file from Kaggle...
Downloading playground-series-s4e12.zip to /content
 99% 75.0M/75.5M [00:03<00:00, 28.3MB/s]
100% 75.5M/75.5M [00:03<00:00, 20.0MB/s]
Data folder is empty. Unzipping files...
Archive:  playground-series-s4e12.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          
playground-series-s4e12.zip has been removed.


# Perform EDA (Exploratory Data Analysis) on train data

## Run the EDA only if it is not available on GitHub; otherwise download the EDA folder from GitHub

## Download the EDA data from GitHub if available

In [3]:
# Step 1: Download the repository as a zip file
!wget https://github.com/Hero988/playground-series-s4e12-competition-results/archive/refs/heads/main.zip

# Step 2: Unzip the file
!unzip main.zip

# Step 3: Move the desired folder to a specific location
!mv playground-series-s4e12-competition-results-main/eda_train_initial /content/eda_train_initial

# Step 4: Clean up by removing the unneeded files
!rm -r playground-series-s4e12-competition-results-main main.zip

--2024-12-14 02:01:39--  https://github.com/Hero988/playground-series-s4e12-competition-results/archive/refs/heads/main.zip
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/Hero988/playground-series-s4e12-competition-results/zip/refs/heads/main [following]
--2024-12-14 02:01:39--  https://codeload.github.com/Hero988/playground-series-s4e12-competition-results/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 20.27.177.114
Connecting to codeload.github.com (codeload.github.com)|20.27.177.114|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [ <=>                ] 531.81K  2.99MB/s    in 0.2s    

2024-12-14 02:01:40 (2.99 MB/s) - ‘main.zip’ saved [544575]

Archive:  main.zip
6a48cb1f087dbc89500ec3554f6163

## Load the train dataset

In [4]:
# Read the training split that the download cell extracted into /content/data
data_path = "/content/data/train.csv"
df_train = pd.read_csv(data_path)

# Partition the column names by dtype: object columns are treated as
# categorical features, numeric columns as numerical features.
categorical_cols = list(df_train.select_dtypes(include=['object']).columns)
numerical_cols = list(df_train.select_dtypes(include=['number']).columns)

## Create a folder to save the EDA outputs and plots

In [5]:
# A locally generated EDA is only needed when the pre-computed folder is absent
if not os.path.exists("/content/eda_train_initial"):
  # Root directory that receives the locally generated EDA artifacts
  output_folder = "/content/eda"
  os.makedirs(name=output_folder, exist_ok=True)

## Basic Information

In [6]:
# Dump a plain-text overview of the training frame (schema, first rows, shape)
if not os.path.exists("/content/eda_train_initial"):
  basic_info_path = os.path.join(output_folder, "basic_info.txt")
  with open(basic_info_path, "w") as f:
    f.write("Basic Information:\n")
    # DataFrame.info writes straight into the open file handle
    df_train.info(buf=f)
    f.write("\n\nFirst 5 rows:\n" + df_train.head().to_string())
    f.write(f"\n\nDataset shape: {df_train.shape}\n")

## Missing Values

In [7]:
# Persist the number of missing entries per column as a CSV
if not os.path.exists("/content/eda_train_initial"):
  missing_values = df_train.isna().sum()
  missing_values_path = os.path.join(output_folder, "missing_values.csv")
  missing_values.to_csv(missing_values_path)

## Visualize missing values

In [8]:
# Render the null mask of the training frame as a heatmap and save it to disk
if not os.path.exists("/content/eda_train_initial"):
  fig, ax = plt.subplots(figsize=(12, 6))
  sns.heatmap(df_train.isnull(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)
  ax.set_title("Missing Values Heatmap", fontsize=16)
  fig.savefig(os.path.join(output_folder, "missing_values_heatmap.png"))
  # Release the figure so it is neither displayed inline nor kept in memory
  plt.close(fig)

## Summary statistics

In [9]:
# Save describe() output covering every column (include="all") as a CSV
if not os.path.exists("/content/eda_train_initial"):
  summary_statistics = df_train.describe(include="all")
  summary_statistics.to_csv(os.path.join(output_folder, "summary_statistics.csv"))

## Categorical vs Numerical Features

In [10]:
# Record which columns were classified as categorical and which as numerical
if not os.path.exists("/content/eda_train_initial"):
  with open(os.path.join(output_folder, "feature_types.txt"), "w") as f:
    f.writelines([
      f"Categorical Columns: {categorical_cols}\n",
      f"Numerical Columns: {numerical_cols}\n",
    ])

## Distribution of Numerical Features

In [11]:
# Save one histogram (with KDE overlay) per numerical column
if not os.path.exists("/content/eda_train_initial"):
  # Histograms go into their own subfolder of the EDA output directory
  parent_folder = "/content/eda"
  subfolder = os.path.join(parent_folder, "numerical_distributions")
  os.makedirs(subfolder, exist_ok=True)

  for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df_train[col], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(f"Distribution of {col}", fontsize=14)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.grid(True)
    fig.savefig(os.path.join(subfolder, f"distribution_{col}.png"))
    # Close each figure right away so open figures do not pile up across the loop
    plt.close(fig)

## Correlation Matrix for Numerical Features

In [12]:
# Save an annotated heatmap of the pairwise correlations between numerical columns
if not os.path.exists("/content/eda_train_initial"):
  plt.figure(figsize=(12, 8))
  correlation_matrix = df_train[numerical_cols].corr()
  sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar=True)
  plt.title("Correlation Matrix of Numerical Features", fontsize=16)
  # Save via `output_folder` like the other root-level EDA artifacts, so this
  # cell does not rely on `parent_folder` having been set by the histogram cell.
  plt.savefig(os.path.join(output_folder, "correlation_matrix.png"))
  plt.close()

## Pair Plot for Numerical Features

In [13]:
# Save a pair plot of the numerical columns, but only for small feature sets
if not os.path.exists("/content/eda_train_initial"):
  if len(numerical_cols) <= 5:  # Pair plots can be overwhelming for too many features
    pairplot = sns.pairplot(df_train[numerical_cols])
    # `.figure` is the supported accessor; seaborn documents `.fig` as deprecated
    pairplot.figure.suptitle("Pair Plot of Numerical Features", y=1.02, fontsize=16)
    pairplot.savefig(os.path.join(output_folder, "pairplot_numerical.png"))
    # Close the grid's own figure explicitly rather than whichever is current
    plt.close(pairplot.figure)

## Outliers Detection (Boxplot)

In [14]:
# Save one boxplot per numerical column to make outliers visible
if not os.path.exists("/content/eda_train_initial"):
  # Boxplots go into their own subfolder of the EDA output directory
  parent_folder = "/content/eda"
  subfolder_3 = os.path.join(parent_folder, "outliers_detection")
  os.makedirs(subfolder_3, exist_ok=True)

  for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=df_train[col], ax=ax)
    ax.set_title(f"Outliers in {col}", fontsize=14)
    ax.set_xlabel(col)
    ax.grid(True)
    fig.savefig(os.path.join(subfolder_3, f"boxplot_{col}.png"))
    # Close each figure right away so open figures do not pile up across the loop
    plt.close(fig)

## Downloading the newly created eda folder

In [15]:
if not os.path.exists("/content/eda_train_initial"):
  # Specify the folder path you want to download
  folder_to_download = "/content/eda"

  # Create a zip file
  shutil.make_archive("eda", "zip", folder_to_download)