In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the housing data from 'boston.csv' (path is relative to the working directory).
data = pd.read_csv('boston.csv')

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

Data information:
CRIM: Crime Rate - Per capita crime rate by town
ZN: Residential Zone - Proportion of residential land zoned for large lots (over 25,000 sq. ft.)
INDUS: Business Zone - Proportion of non-retail business acres per town
CHAS: Charles River - Dummy variable (1 if tract bounds river, 0 otherwise)
NOX: Nitric Oxides - Concentration of nitric oxides (parts per 10 million)
RM: Rooms - Average number of rooms per dwelling
AGE: Age - Proportion of owner-occupied units built prior to 1940
DIS: Employment Distance - Weighted distances to five Boston employment centers
RAD: Highway Accessibility - Index of accessibility to radial highways
TAX: Property Tax - Full-value property-tax rate per $10,000
PTRATIO: Pupil-Teacher Ratio - Pupil-teacher ratio by town
B: Black Proportion - Derived index 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town
LSTAT: Lower Status - Percentage of lower status of the population
MEDV: Median Value - Median value of owner-occupied homes in $1000s

In [None]:
# Human-readable labels for the abbreviated column codes of the raw file.
column_names = {
    "CRIM": "Crime Rate",
    "ZN": "Residential Zone",
    "INDUS": "Business Zone",
    "CHAS": "Charles River",
    "NOX": "Nitric Oxides",
    "RM": "Rooms",
    "AGE": "Age",
    "DIS": "Employment Distance",
    "RAD": "Highway Accessibility",
    "TAX": "Property Tax",
    "PTRATIO": "Pupil-Teacher Ratio",
    "B": "Black Proportion",
    "LSTAT": "Lower Status",
    "MEDV": "Median Value",
}

# rename() returns a new frame; rebinding `data` means every later cell
# works with the descriptive labels.
data = data.rename(column_names, axis="columns")

In [None]:
# Confirm the column labels after the rename.
data.columns

In [None]:
# Missing-value count per column.
data.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Distinct-value count per column.
data.nunique()

In [None]:
# Histogram of 'Residential Zone', drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(data['Residential Zone'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Residential Zone')
ax.set_xlabel('Proportion of Residential Land Zoned for Lots Over 25,000 sq.ft.')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Hand-picked grouping of the columns; the plots below are driven by these lists.
discrete_feature = ['Residential Zone']
categorical_feature = ['Charles River', 'Highway Accessibility']

# Everything not listed above is treated as continuous. The list is built from
# all of data.columns, so it also includes 'Median Value' when that column exists.
non_continuous = set(discrete_feature) | set(categorical_feature)
continuous_feature = [column for column in data.columns if column not in non_continuous]

In [None]:
# Summary statistics for the continuous columns.
# Note: 'mean' is the arithmetic average ("moyenne" in French).
data[continuous_feature].describe()

In [None]:
# Distribution overview: one row per continuous feature, with a histogram
# (plus KDE) on the left and a horizontal box plot on the right.
num_rows = len(continuous_feature)
num_cols = 2  # One for histogram, one for box plot

# squeeze=False keeps `axes` two-dimensional even when there is only one
# feature, so the row/column indexing below works for any list length >= 1.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows), squeeze=False)

for i, feature in enumerate(continuous_feature):
    hist_ax, box_ax = axes[i]

    # Left panel: histogram with a kernel-density overlay
    sns.histplot(data=data, x=feature, bins=20, kde=True, color='skyblue', edgecolor='black', ax=hist_ax)
    hist_ax.set_title(f'{feature} Distribution (Histogram)')
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Frequency')

    # Right panel: horizontal box plot. Matplotlib's boxplot does not skip
    # NaN (the box would come out empty), so missing values are dropped first.
    box_ax.boxplot(data[feature].dropna(), vert=False)
    box_ax.set_title(f'{feature} Distribution (Box Plot)')
    box_ax.set_xlabel(feature)

# Adjust layout so titles and labels of neighbouring panels do not overlap
plt.tight_layout()
plt.show()

In [None]:
# IQR-based outlier listing for 'Crime Rate'.
# First and third quartiles, fetched in a single quantile() call.
Q1, Q3 = data['Crime Rate'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences: anything further than 1.5 * IQR outside the quartiles is flagged.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows lying outside the fences, ordered by their 'Crime Rate' value.
is_outlier = (data['Crime Rate'] < lower_bound) | (data['Crime Rate'] > upper_bound)
outliers = data[is_outlier]
outliers_sorted = outliers.sort_values(by='Crime Rate')

# Print every flagged row in full, without the index column.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outliers_sorted.to_string(index=False))

In [None]:
# IQR-based outlier listing for 'Rooms'.
# First and third quartiles, fetched in a single quantile() call.
Q1, Q3 = data['Rooms'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences: anything further than 1.5 * IQR outside the quartiles is flagged.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows lying outside the fences, ordered by their 'Rooms' value.
is_outlier = (data['Rooms'] < lower_bound) | (data['Rooms'] > upper_bound)
outliers = data[is_outlier]
outliers_sorted = outliers.sort_values(by='Rooms')

# Print every flagged row in full, without the index column.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outliers_sorted.to_string(index=False))

In [None]:
# IQR-based outlier listing for 'Black Proportion'.
# First and third quartiles, fetched in a single quantile() call.
Q1, Q3 = data['Black Proportion'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences: anything further than 1.5 * IQR outside the quartiles is flagged.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows lying outside the fences, ordered by their 'Black Proportion' value.
is_outlier = (data['Black Proportion'] < lower_bound) | (data['Black Proportion'] > upper_bound)
outliers = data[is_outlier]
outliers_sorted = outliers.sort_values(by='Black Proportion')

# Print every flagged row in full, without the index column.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outliers_sorted.to_string(index=False))

In [None]:
# Grid layout: one row per discrete feature, histogram left and box plot right.
num_rows = len(discrete_feature)
num_cols = 2  # One for histogram and one for box plot

# squeeze=False always yields a 2-D axes array, so a single feature needs no
# special-case reshaping before the [row, column] indexing below.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows), squeeze=False)

for i, feature in enumerate(discrete_feature):
    hist_ax = axes[i, 0]
    box_ax = axes[i, 1]

    # Histogram with automatically chosen bins
    sns.histplot(data=data, x=feature, bins='auto', color='skyblue', edgecolor='black', ax=hist_ax)
    hist_ax.set_title(f'{feature} Distribution (Histogram)')
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Frequency')

    # Box plot of the same feature
    sns.boxplot(data=data, x=feature, ax=box_ax)
    box_ax.set_title(f'{feature} Distribution (Box Plot)')
    box_ax.set_xlabel(feature)

# Tighten spacing between panels, then render
plt.tight_layout()
plt.show()

In [None]:
# Count plot (one bar per category) for each categorical feature, stacked vertically.
num_rows = len(categorical_feature)
num_cols = 1

# squeeze=False keeps `axes` a 2-D array; without it a single-feature list
# would make plt.subplots return a bare Axes and the indexing below would fail.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(8, 5*num_rows), squeeze=False)

for i, feature in enumerate(categorical_feature):
    ax = axes[i, 0]

    # Bar height is the number of rows in each category
    sns.countplot(data=data, x=feature, ax=ax)
    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# 'Median Value' distribution: overall (left) and split by 'Charles River' (right).
fig, (ax_overall, ax_by_river) = plt.subplots(1, 2)
sns.histplot(data=data, x='Median Value', bins=30, kde=True, color='g', ax=ax_overall)
sns.histplot(data=data, x='Median Value', bins=30, kde=True, hue='Charles River', ax=ax_by_river)
plt.show()

In [None]:
# Scatter plot of every continuous/discrete feature against the target, with
# the Pearson correlation shown in the title.
target = 'Median Value'

for feature in continuous_feature + discrete_feature:
    # continuous_feature is built from all columns and therefore contains the
    # target itself; plotting it against itself only yields a trivial 1.00.
    if feature == target:
        continue

    # Calculate correlation coefficient
    correlation_coefficient = data[target].corr(data[feature])

    # Create scatter plot (target on the x-axis, feature on the y-axis)
    fig, ax = plt.subplots()
    ax.scatter(data[target], data[feature])
    ax.set_title(f"Scatter Plot {feature} (Correlation: {correlation_coefficient:.2f})")
    ax.set_xlabel('Median Value')
    ax.set_ylabel(feature)

    plt.show()

In [None]:
# Box plots of 'Median Value' grouped by each categorical feature.
# Set the style of seaborn
sns.set(style="whitegrid")

# One subplot row per categorical feature
num_rows = len(categorical_feature)
num_cols = 1  # One for box plot

# squeeze=False keeps `axes` a 2-D array; without it a single-feature list
# would make plt.subplots return a bare Axes and the indexing below would fail.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 5*num_rows), squeeze=False)

for i, feature in enumerate(categorical_feature):
    ax = axes[i, 0]

    # Target distribution within each category of the feature
    sns.boxplot(data=data, x=feature, y='Median Value', ax=ax)
    ax.set_title(f'Box Plot: {feature} vs Median Value')
    ax.set_xlabel(feature)
    ax.set_ylabel('Median Value')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# Box plots of 'Median Value' grouped by each discrete feature.
sns.set(style="whitegrid")

# Grid dimensions: one single-column row per discrete feature
num_rows = len(discrete_feature)
num_cols = 1  # One for box plot

# squeeze=False guarantees a 2-D axes array regardless of the grid size
fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 5*num_rows), squeeze=False)

for i, feature in enumerate(discrete_feature):
    # Map the running index onto its (row, column) cell in the grid
    row_index, col_index = divmod(i, num_cols)
    ax = axes[row_index, col_index]

    sns.boxplot(data=data, x=feature, y='Median Value', ax=ax)
    ax.set_title(f'Box Plot: {feature} vs Median Value')
    ax.set_xlabel(feature)

# Tighten spacing between panels, then render
plt.tight_layout()
plt.show()

In [None]:
# Separate the regression target (y) from the predictor columns (X).
y = data['Median Value']
X = data.drop(columns='Median Value')

In [None]:
# Split the dataset into a training set (80%) and a test set (20%);
# the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

In [None]:
# Make sure that X_train and X_test are DataFrames carrying the same column labels as X.
# NOTE(review): X is already a DataFrame at this point, so the split presumably
# returns DataFrames too and this re-wrap is likely a no-op safeguard — confirm
# before removing.
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [None]:
# Standardise the non-categorical predictors; categorical columns pass through unscaled.
from sklearn.preprocessing import StandardScaler

# Columns to standardise: every predictor not listed in categorical_feature
features_to_scale = [name for name in X.columns if name not in categorical_feature]

# Fit the scaler on the training split only, then reuse its statistics for the
# test split. Results are wrapped back into DataFrames that keep the original
# row index, so they line up with the untouched categorical columns.
sc = StandardScaler()
X_train_scaled = pd.DataFrame(
    sc.fit_transform(X_train[features_to_scale]),
    columns=features_to_scale,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    sc.transform(X_test[features_to_scale]),
    columns=features_to_scale,
    index=X_test.index,
)

# Re-attach the unscaled categorical columns next to the scaled ones
X_train_final = pd.concat([X_train_scaled, X_train[categorical_feature]], axis=1)
X_test_final = pd.concat([X_test_scaled, X_test[categorical_feature]], axis=1)

# Quick look at the assembled design matrices
print("X_train_final head:", X_train_final.head())
print("X_test_final head:", X_test_final.head())

In [None]:
# Standardise the target with its own scaler (kept as sc_y so predictions can
# be mapped back to the original units). The scaler works on 2-D input, hence
# the reshape of each target vector into a single column.
sc_y = StandardScaler()
y_train_scaled = sc_y.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_scaled = sc_y.transform(y_test.to_numpy().reshape(-1, 1))

Create an evaluation function that returns all metrics after model training

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
def evaluate_model(true, predicted):
    """Compute the regression metrics used to compare models.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions for the same samples.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2, median_ae)``: mean absolute error, mean squared
        error, root mean squared error, R² score and median absolute error,
        in that order.
    """
    mse = mean_squared_error(true, predicted)
    return (
        mean_absolute_error(true, predicted),
        mse,
        np.sqrt(mse),  # RMSE is the square root of the MSE computed above
        r2_score(true, predicted),
        median_absolute_error(true, predicted),
    )

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Train each candidate regressor on the scaled data and report train/test
# metrics in the target's original units.

# Seed shared by the stochastic estimators so repeated runs give the same scores.
MODEL_SEED = 41

# Dictionary of regression models (display name -> unfitted estimator)
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Elastic Net": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "Support Vector Machine": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor()
}
model_list = []  # model names, in evaluation order
r2_list = []     # matching test-set R2 scores

# The scaled target is stored as a column vector; flatten it once so every
# estimator receives the 1-D target it expects (avoids scikit-learn's
# column-vector warnings and gives all models the same prediction shape).
y_train_target = np.ravel(y_train_scaled)

for model_name, model in models.items():
    model.fit(X_train_final, y_train_target)  # train model

    # Predict in scaled space, then map back to the original target units.
    # inverse_transform needs 2-D input; the result is flattened again for the metrics.
    y_train_pred = sc_y.inverse_transform(model.predict(X_train_final).reshape(-1, 1)).ravel()
    y_test_pred = sc_y.inverse_transform(model.predict(X_test_final).reshape(-1, 1)).ravel()

    # Evaluate Train and Test dataset against the unscaled targets
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2, model_train_median_ae = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2, model_test_median_ae = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model performance for training set")
    print("- Root Mean Squared Error : {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error : {:.4f}".format(model_train_mae))
    print("- R2 Score : {:.4f}".format(model_train_r2))
    print("- Median Absolute Error : {:.4f}".format(model_train_median_ae))


    print('--------------------------------------------------------------------------------------')

    print("Model performance for test set")
    print("- Root Mean Squared Error : {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error : {:.4f}".format(model_test_mae))
    print("- R2 Score : {:.4f}".format(model_test_r2))
    print("- Median Absolute Error : {:.4f}".format(model_test_median_ae))

    r2_list.append(model_test_r2)

    print("\n")