# Separate Columns

In [None]:
import pandas as pd
from google.colab import files

# Load the raw export, whose fields are separated by semicolons.
file_path = 'RawDataset.csv'
data = pd.read_csv(file_path, sep=';')

# Write the parsed table back out with to_csv's default separator,
# leaving out the DataFrame index so no extra column is added.
output_path = 'CleanedDataset.csv'
data.to_csv(path_or_buf=output_path, index=False)

# Show the first rows as a quick sanity check, then trigger the Colab
# browser download of the file that was just written.
print(data.head())
files.download(output_path)

# Find Null Values

In [None]:
import pandas as pd

# Load the cleaned dataset from disk.
data = pd.read_csv('CleanedDataset.csv')

# Count the missing entries in every column (isna is the same check as isnull).
nullValues = data.isna().sum()

# Print the heading and the per-column counts, one after the other.
print("Null values in each column:", nullValues, sep="\n")

# Feature Importance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the cleaned dataset.
file_path = 'CleanedDataset.csv'
data = pd.read_csv(file_path)

# Integer-encode every object-typed column on a copy of the data so the
# random forest can consume it; the fitted encoders are kept per column.
data_encoded = data.copy()
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder()
    encoded_values = le.fit_transform(data[col])
    data_encoded[col] = encoded_values
    label_encoders[col] = le

# Separate the target column 'y' from the predictors and hold out 20% of rows.
y = data_encoded['y']
X = data_encoded.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a default random forest on the training split only; it is used here
# just as a source of feature importances.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Pair each predictor with its importance and order them from most to
# least important.
features = X.columns
importances = rf.feature_importances_
indices = np.flip(np.argsort(importances))
features_sorted = features[indices]
importances_sorted = importances[indices]

# Horizontal bar chart; the y-axis is inverted so the most important
# feature is drawn at the top.
plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.barh(features_sorted, importances_sorted, color='purple')
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title('Feature Importance for Predicting y', fontsize=14)
ax.invert_yaxis()
plt.show()