# Dataset Analysis

In [1]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Load the train and test datasets (paths are relative to the working directory)
new_train_data = pd.read_csv("train.csv")
new_test_data = pd.read_csv("test.csv")

# Check basic structure of the datasets.
# DataFrame.info() writes its report directly and returns None, so it is called
# on its own; wrapping it in print() would append a stray "None" line.
print("Train Dataset Overview:\n")
new_train_data.info()
print("\nTest Dataset Overview:\n")
new_test_data.info()

# Display summary statistics for numerical columns
print("\nTrain Dataset Numerical Summary:\n", new_train_data.describe())
print("\nTest Dataset Numerical Summary:\n", new_test_data.describe())

# Display first few rows of the datasets
print("\nTrain Dataset Sample Rows:\n", new_train_data.head())
print("\nTest Dataset Sample Rows:\n", new_test_data.head())
    

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Missing Values Analysis

In [None]:

# Share of missing entries per column, expressed as a percentage
train_missing = new_train_data.isna().mean().mul(100)
test_missing = new_test_data.isna().mean().mul(100)

# Report only the columns that actually contain gaps, largest share first
train_gaps = train_missing.loc[train_missing.gt(0)].sort_values(ascending=False)
test_gaps = test_missing.loc[test_missing.gt(0)].sort_values(ascending=False)

print("Train Dataset Missing Values (%):\n", train_gaps)
print("\nTest Dataset Missing Values (%):\n", test_gaps)
    

## Unique Value Analysis

In [None]:

# Report the number of distinct (non-null) values for every training column,
# with a dashed rule after each entry.
separator = "-" * 50
for col in new_train_data.columns:
    distinct_count = new_train_data[col].nunique()
    print(f"Column: {col}", f"Unique Values: {distinct_count}", separator, sep="\n")
    

## Data Visualization

In [None]:

# Histogram (with KDE overlay) of the target column
plt.figure(figsize=(8, 5))
score_values = new_train_data['matched_score']
score_ax = sns.histplot(score_values, kde=True, color='blue')
score_ax.set_title('Distribution of Matched Score')
score_ax.set_xlabel('Matched Score')
score_ax.set_ylabel('Frequency')
plt.show()

# Box plot of the target grouped by skills_required
plt.figure(figsize=(10, 6))
skills_ax = sns.boxplot(data=new_train_data, x='skills_required', y='matched_score')
skills_ax.set_title('Skills Required vs Matched Score')
# Tilt the category labels so they are less likely to overlap
plt.setp(skills_ax.get_xticklabels(), rotation=45)
plt.show()

# Pairwise correlations between the float64/int64 columns
numerical_cols = new_train_data.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = new_train_data.loc[:, numerical_cols].corr()

plt.figure(figsize=(10, 8))
corr_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
corr_ax.set_title('Correlation Matrix')
plt.show()
    

## Handle Missing Values

In [None]:

# Fill missing values with a strategy that matches each column's dtype.
#
# A blanket fillna("Unknown") would also write the string into numeric columns,
# turning them into mixed-type object columns. Instead:
#   * object (text) columns get the "Unknown" placeholder;
#   * numeric feature columns get the median computed on the TRAIN set, so the
#     test set is imputed with the same statistic;
#   * the target 'matched_score' is left untouched rather than imputed.
#     NOTE(review): rows with a missing target probably need to be dropped
#     before modeling - confirm whether any exist.
train_numeric_features = (
    new_train_data.select_dtypes(include=['number']).columns.difference(['matched_score'])
)
train_medians = new_train_data[train_numeric_features].median()


def _fill_values_for(frame):
    """Return a {column: fill value} mapping for `frame`.

    Object columns map to "Unknown"; numeric columns that also have a usable
    (non-NaN) train median map to that median. Other columns are omitted and
    therefore left unchanged by fillna.
    """
    fill_values = {
        col: "Unknown" for col in frame.select_dtypes(include=['object']).columns
    }
    numeric_cols_in_frame = frame.select_dtypes(include=['number']).columns
    for col in numeric_cols_in_frame.intersection(train_medians.index):
        if pd.notna(train_medians[col]):
            fill_values[col] = train_medians[col]
    return fill_values


# Reassign instead of using inplace=True so re-running the cell stays predictable
new_train_data = new_train_data.fillna(value=_fill_values_for(new_train_data))
new_test_data = new_test_data.fillna(value=_fill_values_for(new_test_data))

print("Missing values handled successfully.")
    

## Encoding Categorical Features

In [None]:

# Encode categorical (object-dtype) columns as integer codes.
#
# One encoder is fitted per column on the TRAIN data and kept in
# `label_encoders`, so the codes can be inverted later.
# The test data is encoded with the train mapping; labels that never occurred
# in train are mapped to -1, because LabelEncoder.transform would raise a
# ValueError on them.
categorical_cols = new_train_data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    label_encoder = LabelEncoder()
    new_train_data[col] = label_encoder.fit_transform(new_train_data[col])
    label_encoders[col] = label_encoder

    # A categorical column may exist only in the train set; skip it for test
    if col not in new_test_data.columns:
        continue

    # classes_ is ordered by assigned code, so enumerate() rebuilds the mapping
    code_by_label = {label: code for code, label in enumerate(label_encoder.classes_)}
    new_test_data[col] = (
        new_test_data[col].map(code_by_label).fillna(-1).astype('int64')
    )

print("Categorical columns encoded successfully.")
    

## Prepare Data for Modeling

In [None]:

# Separate predictors from the target; 'address' is excluded from the features
excluded_cols = ['matched_score', 'address']
X = new_train_data.drop(excluded_cols, axis=1)
y = new_train_data['matched_score']

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

print("Data split into training and validation sets successfully.")
    

## Feature Importance Using Random Forest

In [None]:

# Fit a random forest on the training split (fixed seed for repeatability)
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Rank the features from most to least important
importance = rf.feature_importances_
indices = importance.argsort()[::-1]
features = X_train.columns

ranked_scores = importance[indices]
ranked_names = features[indices]

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(12, 8))
importance_ax = sns.barplot(x=ranked_scores, y=ranked_names, palette='viridis')
importance_ax.set_title('Feature Importance')
importance_ax.set_xlabel('Importance Score')
importance_ax.set_ylabel('Features')
plt.show()
    

## Save Cleaned Data

In [None]:

# Write both processed frames to CSV (without the index column) for reuse
outputs = (
    (new_train_data, "cleaned_train_data.csv"),
    (new_test_data, "cleaned_test_data.csv"),
)
for frame, out_path in outputs:
    frame.to_csv(out_path, index=False)

print("Cleaned data saved successfully.")
    