In [None]:
%%time
!pip install pandas matplotlib seaborn numpy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline
plt.style.use('seaborn-v0_8')

# Create output directory for figures
os.makedirs('notebook_output', exist_ok=True)

In [None]:

# Read the CSV into `data`, then report its dimensions, its leading rows
# and the list of column names.
data = pd.read_csv('temp_csv/data.csv')

print("data shape:", data.shape)
print("\nFirst few rows:", data.head(), sep="\n")
print("Columns:", data.columns.to_list())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Exploratory data analysis of `data`: summary statistics, a correlation
# heatmap, per-column histograms and boxplots, a pairplot, the target class
# counts, and per-feature violin plots split by the target.
# Every figure is saved as a PNG inside FIGURE_DIR and then closed, so this
# cell renders no figures inline.

# Set the style for seaborn
sns.set(style='whitegrid')

# Name of the target column used for hue / grouping below.
TARGET_COLUMN = 'Bankrupt?'

# Single destination for every figure written by this cell. It is created
# here because savefig does not create missing directories.
FIGURE_DIR = 'notebook_output/data'
os.makedirs(FIGURE_DIR, exist_ok=True)


def _safe_name(column):
    """Return ``column`` as a fragment that is safe to embed in a file name.

    Surrounding whitespace is stripped, then every character that is not a
    letter, a digit, '-' or '_' is replaced with '_'. This covers spaces as
    well as characters such as '?' (invalid in Windows file names) and '/'
    (which would otherwise be read as a directory separator).
    """
    stripped = str(column).strip()
    return ''.join(ch if (ch.isalnum() or ch in '-_') else '_' for ch in stripped)


def _save_current_figure(file_name):
    """Save the current matplotlib figure as FIGURE_DIR/file_name and close it."""
    plt.savefig(os.path.join(FIGURE_DIR, file_name))
    # Close after saving so figures do not accumulate in memory across loops.
    plt.close()


# Descriptive statistics
descriptive_stats = data.describe()
print(descriptive_stats)

# Correlation matrix
correlation_matrix = data.corr()
print(correlation_matrix)

# Heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
_save_current_figure('correlation_matrix_heatmap.png')

# Distribution plots for each column (the target column included)
for column in data.columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(data[column], kde=True, bins=30)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    _save_current_figure(f'distribution_{_safe_name(column)}.png')

# Boxplots to check for outliers
for column in data.columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=data[column])
    plt.title(f'Boxplot of {column}')
    plt.xlabel(column)
    _save_current_figure(f'boxplot_{_safe_name(column)}.png')

# Pairplot to visualize relationships between features.
# NOTE(review): pairplot draws one panel per pair of columns, so on a wide
# frame this can be very slow and produce a huge image - consider passing a
# column subset via `vars=` if that turns out to be the case.
sns.pairplot(data, hue=TARGET_COLUMN, diag_kind='kde')
# pairplot creates its own figure, which is the current figure at this point.
_save_current_figure('pairplot.png')

# Countplot for the target variable
plt.figure(figsize=(6, 4))
sns.countplot(x=TARGET_COLUMN, data=data)
plt.title('Count of Bankrupt vs Non-Bankrupt Companies')
plt.xlabel('Bankrupt?')
plt.ylabel('Count')
_save_current_figure('countplot_bankrupt.png')

# Violin plots to see the distribution of features with respect to the target
# variable. The target is excluded by name rather than by position, so this
# does not depend on it being the first column.
feature_columns = [name for name in data.columns if name != TARGET_COLUMN]
for column in feature_columns:
    plt.figure(figsize=(8, 4))
    sns.violinplot(x=TARGET_COLUMN, y=column, data=data)
    plt.title(f'Violin plot of {column} by Bankrupt Status')
    plt.xlabel('Bankrupt?')
    plt.ylabel(column)
    _save_current_figure(f'violinplot_{_safe_name(column)}.png')