In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from ydata_profiling import ProfileReport

In [None]:
# Load the training set and print three quick overviews of it:
# the first rows, summary statistics, and the column/dtype listing.
data = pd.read_csv('../data/train.csv')

for heading, summarise in (
    ("\nSample Data:", data.head),
    ("\nDescriptive Statistics:", data.describe),
):
    print(heading)
    print(summarise())

# DataFrame.info() writes its report itself, so it is not wrapped in print().
print("\nDataset Info:")
data.info()

In [None]:
# Report data-quality basics: per-column missing counts and duplicated rows.
# Only columns that actually contain missing values are listed, so the output
# matches its heading. to_string() prints the full listing; a plain print()
# of a long Series is elided by pandas and can hide affected columns.
missing_counts = data.isnull().sum()
missing_counts = missing_counts[missing_counts > 0].sort_values(ascending=False)

print("\nColumns with Missing Values:")
print(missing_counts.to_string() if not missing_counts.empty else "(none)")

print("\nRows with duplicated values:", data.duplicated().sum())

In [None]:
# Remove the "Id" column before the analysis cells below.
# errors="ignore" keeps this cell idempotent: with the previous in-place drop,
# running the cell a second time raised KeyError because "Id" was already gone.
data = data.drop(columns=["Id"], errors="ignore")

In [None]:
# Histogram grid with one panel per numeric column of `data`.
print("\nDistribution of numerical variables:")
numeric_frame = data.select_dtypes(include=np.number)
numeric_frame.hist(figsize=(15, 10), bins=20)
plt.suptitle('Distribution of numerical variables')
plt.show()

In [None]:
# One count plot per object-dtype column, each drawn on its own figure.
print("\nDistribution of categorical variables:")
categorical_cols = data.select_dtypes(include=['object']).columns
for cat_col in categorical_cols:
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.countplot(data=data, x=cat_col, ax=ax)
    ax.set_title(f'Distribution of categorical variable: {cat_col}')
    # Tilt the category labels on the current axes so they are less likely to overlap.
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# One horizontal box plot per numeric column, each on its own figure.
print("\nBox plots for numeric variables:")
numerical_cols = data.select_dtypes(include=np.number).columns
for num_col in numerical_cols:
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.boxplot(x=data[num_col], ax=ax)
    ax.set_title(f'Box plot for {num_col}')
    plt.show()

In [None]:
# Label-encode every object-dtype column of `data` in place, so the
# correlation and model cells below operate on integer codes.
# LabelEncoder comes from sklearn.preprocessing (imported in the first cell);
# it was previously used without being imported, which raised NameError.
#
# A fresh encoder is fitted per column and kept in `label_encoders`. With a
# single shared encoder, each refit discarded the previous column's mapping.
# Codes can be mapped back to the original labels with
# label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

In [None]:
# Pairwise correlations of all columns in `data`, drawn as an annotated heatmap.
print("\nCorrelation matrix:")
correlation_matrix = data.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation matrix')
plt.show()

In [None]:
# Fit a random forest on every column except 'Status' and rank the columns by
# the fitted model's feature importances.
# NOTE(review): a regressor is fitted on 'Status'; if 'Status' is a
# label-encoded category, a classifier may be the better fit. Confirm.
y = data['Status']
X = data.drop(columns=['Status'])

# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(random_state=42).fit(X, y)

# Index the importances by column name, largest first.
features_importance = pd.Series(
    model.feature_importances_, index=X.columns
).sort_values(ascending=False)

print("Features Importance in model:")
for name, score in features_importance.items():
    print(f"{name}: {score:.4f}")

In [None]:
# Correlation heatmap of the encoded frame.
# `data` is used here: the label-encoding cell overwrites the object columns
# of `data` in place, and no `data_encoded` frame is ever created in this
# notebook, so the previous reference raised NameError.
print("\nCorrelation matrix:")
correlation_matrix = data.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation matrix')
plt.show()

In [None]:
# Build a ydata-profiling report for `data`, embed it in the notebook and
# save it as a standalone HTML file.
# Note: by this point the object columns of `data` have been replaced with
# integer codes by the label-encoding cell, so the report describes the
# encoded frame rather than the raw CSV.
profile = ProfileReport(data, title="House Prices")
profile.to_notebook_iframe()
# Path fixed: "..reports/..." lacked the separator after "..", so it pointed
# at a directory literally named "..reports" instead of the parent's
# "reports" folder. This mirrors the "../data/train.csv" layout used for
# loading. Assumes ../reports already exists; confirm.
profile.to_file("../reports/profile_report.html")