In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file under the read-only Kaggle input directory and print
# its full path, one per line.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Exploratory Data Analysis (EDA)**

In [None]:
# Read the training split of the "home-data-for-ml-course" Kaggle dataset.
train_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/home-data-for-ml-course/train.csv"
)
# Display the first five rows as the cell output.
train_data.head(n=5)

In [None]:
# Print the per-column dtype and non-null summary of the training frame.
train_data.info()

# Split the column names by kind. 'number' matches every numeric dtype (any
# integer or float width), so a numeric column is not silently left out when
# pandas infers something other than int64/float64.
numerical_cols = train_data.select_dtypes(include='number').columns
categorical_cols = train_data.select_dtypes(include=['object', 'category']).columns

In [None]:
# Boolean mask flagging rows that exactly repeat an earlier row.
duplicates = train_data.duplicated()
n_duplicates = duplicates.sum()
print("\nNumber of Duplicate Rows:", n_duplicates)

# Only print the repeated rows when at least one exists.
if n_duplicates > 0:
    print("\nDuplicate Rows:\n", train_data.loc[duplicates])


In [None]:
# Work on an independent copy of the training data. Passing an existing frame
# to pd.DataFrame() does not copy its data by default, so `df` would merely be
# a second handle on `train_data`; an explicit copy keeps the loaded data safe
# from any later modification made through `df`.
df = train_data.copy()

In [None]:
# Frequency table and count plot for each of the two categorical columns below.
print("\n=== Analyse des colonnes catégoriques ===")
columns_to_inspect = ("KitchenQual", "ExterQual")
for col in columns_to_inspect:
    print(f"\nColonne: {col}")
    category_counts = df[col].value_counts()
    print(category_counts)  # number of rows per category

    # Bar chart of the same counts, drawn on an explicitly created axes.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=col, data=df, palette='pastel', ax=ax)
    ax.set_title(f"Distribution de {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()

**Corrélation entre les features numériques et SalePrice**

In [None]:

# Pairwise correlations between the numeric columns, rendered as a heatmap.
correlation_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title("Matrice de corrélation")
plt.show()

In [None]:
# Target column, and the cut-off above which |correlation| counts as strong.
feature = "SalePrice"
threshold = 0.5

# Correlation of every column in the matrix with the target, highest first.
correlations_with_feature = correlation_matrix[feature]
sorted_correlations = correlations_with_feature.sort_values(ascending=False)

# Show the full ranking.
print(f"Corrélations avec '{feature}':\n")
print(sorted_correlations)

# Keep only the entries whose absolute correlation exceeds the cut-off.
is_strong = sorted_correlations.abs() > threshold
strong_correlations = sorted_correlations[is_strong]
print(f"\nCorrélations fortes avec '{feature}' (|corr| > {threshold}):\n")
print(strong_correlations.keys())


* SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
* OverallQual: Overall material and finish quality
* YearBuilt: Original construction date
* YearRemodAdd: Remodel date
* TotalBsmtSF: Total square feet of basement area
* GrLivArea: Above grade (ground) living area square feet
* FullBath: Full bathrooms above grade
* 1stFlrSF: First Floor square feet
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet


In [None]:
# List every pair of distinct columns whose absolute correlation exceeds the
# threshold. The correlation matrix is symmetric, so only the entries strictly
# above the diagonal are kept: each pair is reported once instead of twice
# ((A, B) and (B, A)), and the diagonal self-correlations are left out.
threshold = 0.5  # cut-off on |correlation| for a pair to count as strong
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

# Entries outside the upper triangle become NaN and so fail the filter below.
high_correlation_pairs = correlation_matrix.where(above_diagonal).unstack().reset_index()
high_correlation_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]

high_correlation_pairs = high_correlation_pairs[
    high_correlation_pairs["Correlation"].abs() > threshold
]
print("\nHigh Correlation Pairs:\n", high_correlation_pairs)

In [None]:
# Missing-value count for each column retained in `strong_correlations`.
df.loc[:, strong_correlations.index].isna().sum()

In [None]:
# Summary statistics and a histogram for every column in `strong_correlations`.
print("=== Analyse des colonnes numériques ===")
for col in strong_correlations.index:
    print(f"\nColonne: {col}")
    column_values = df[col]
    print(column_values.describe())  # statistical summary

    # Histogram (30 bins) with a kernel-density overlay.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(column_values, kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(f"Distribution de {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Fréquence")
    plt.show()

**Corrélation entre les features catégorielles et SalePrice**

In [None]:

# One-hot encode the categorical columns (dropping the first level of each),
# then recompute the correlation matrix on the encoded frame. Note that this
# rebinds `correlation_matrix`, replacing the numeric-only matrix built earlier.
df_encoded = pd.get_dummies(data=df, drop_first=True)
correlation_matrix = df_encoded.corr(method='pearson')

In [None]:
# Rank every column of the current correlation matrix by its correlation
# with the target column.
feature = "SalePrice"
correlations_with_feature = correlation_matrix.loc[:, feature]
sorted_correlations = correlations_with_feature.sort_values(ascending=False)

# Show the full ranking, highest correlation first.
print(f"Corrélations avec '{feature}':\n")
print(sorted_correlations)

# Retain only the entries whose absolute correlation exceeds the threshold.
threshold = 0.5
strong_mask = sorted_correlations.abs().gt(threshold)
strong_correlations = sorted_correlations.loc[strong_mask]
print(f"\nCorrélations fortes avec '{feature}' (|corr| > {threshold}):\n")
print(strong_correlations.index)

I will add KitchenQual and ExterQual.

In [None]:
# Missing-value count for each encoded column retained in `strong_correlations`.
df_encoded.loc[:, strong_correlations.index].isna().sum()

In [None]:
# Descriptive statistics for the columns retained in `strong_correlations`.
df_encoded.loc[:, strong_correlations.index].describe()

Maybe outliers on: N/A
(regarder si le std est proche de la moyenne, ou s'il y a un espacement plus important entre les quantiles 25%, 50% et 75%)

#  Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Predictor columns fed to the model.
features = ['OverallQual', 'GrLivArea','GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt','YearRemodAdd']

# Take the target and the predictors straight from the encoded training frame.
# Selecting the `strong_correlations` columns first added nothing and raised a
# KeyError as soon as a listed feature fell below the correlation threshold.
y = df_encoded['SalePrice']
X = df_encoded[features]

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest of 100 trees, seeded for reproducible results.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out validation split and compute the three error metrics.
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

# Report the metrics, each rounded to two decimals.
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Load the test split from the same dataset directory as train.csv (absolute
# path, so the cell does not depend on the current working directory).
test_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

# The model's features keep their original column names (none carries a
# one-hot suffix), so they are read directly from the raw test frame. This
# also leaves the training-side `df` and `df_encoded` untouched, so earlier
# cells still work when re-run.
X_test = test_data[features]

# Some test values are missing. Fill them with the medians of the training
# features the model was fitted on (the median is robust to outliers), rather
# than with statistics computed from the test set itself. fillna returns a new
# frame, avoiding an in-place write on a column selection.
X_test = X_test.fillna(X_train.median())

# Confirm that no missing values remain.
X_test.isnull().sum()


In [None]:
# Predict a sale price for every row of the prepared test features.
test_preds = model.predict(X_test)

# Build the submission table (Id plus predicted SalePrice) and write it to
# the working directory without the index column.
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")