<a href="https://colab.research.google.com/github/MohammedMohsen0404/Projects_List/blob/main/Proj7_Airbnb_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
**<center><h1>Airbnb Cleaned Europe Prediction</h1></center>**
<center><h3>Learning ML, DL through 100 Practical Projects</h3></center>

---

# **Import Libraries and Data**
---

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm, skew
import warnings
warnings.filterwarnings('ignore')

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [10]:
# Location of the dataset CSV; adjust this if the file lives somewhere else.
DATA_PATH = "Aemf1.csv"

try:
    # `D` keeps the raw frame untouched; all later cells work on the copy `data`.
    D = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Same exception type as before, but with a message that says what to do
    # (on a fresh Colab/remote runtime the file is not present by default).
    raise FileNotFoundError(
        f"Dataset file '{DATA_PATH}' was not found. Upload or copy it into the "
        "working directory (or update DATA_PATH) and re-run this cell."
    ) from err

data = D.copy()

FileNotFoundError: [Errno 2] No such file or directory: 'Aemf1.csv'

# **Take a look at the data**
---

In [None]:
# Preview the first five rows to get a feel for the columns and values.
data.head(n=5)

In [None]:
# Print a concise summary of the frame: column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Summary of the string-typed (object) columns only.
data.select_dtypes(include='object').describe()

In [None]:
# Summary statistics of the prediction target.
data.loc[:, 'Price'].describe()

# **Exploratory Data Analysis**
---

**Univariate Analysis**

In [None]:
# One histogram per numeric column to inspect the marginal distributions.
numerical_data = data.select_dtypes(include='number')
numerical_data.hist(color='b', figsize=(10, 8))
plt.tight_layout()  # keep the subplot titles from overlapping
plt.show()

In [None]:
# Box plots of every numeric column on a shared axis.
# The frame is passed as `data=` explicitly: seaborn releases before 0.12 bound
# the first positional argument to `x`, which does not draw the wide-form plot.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=numerical_data, ax=ax)
ax.set_title('Box plots of numerical features')
# Column names are long; rotate the tick labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Frequency of each category, one figure per string-typed column.
categorical_data = data.select_dtypes(include='object')
for cat_col in categorical_data.columns:
    sns.countplot(x=cat_col, data=categorical_data, palette="Set1")
    plt.title(f"Countplot of {cat_col}")
    plt.show()

**Multivariate Analysis**

In [None]:
# Pairwise scatter plots (and per-column distributions) of the numeric columns.
numeric_columns = data.select_dtypes(include='number')
sns.pairplot(numeric_columns)
plt.show()

In [None]:
# Annotated correlation matrix of the numeric columns.
correlations = numerical_data.corr()
sns.heatmap(correlations, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Compare the distribution of the target with a fitted normal curve.
# `sns.distplot` is deprecated, so the histogram + KDE come from `sns.histplot`
# and the normal fit is overlaid explicitly.
price = data['Price'].dropna()
mu, sigma = stats.norm.fit(price)

fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(price, kde=True, stat='density', ax=ax_dist)
x_grid = np.linspace(price.min(), price.max(), 200)
ax_dist.plot(x_grid, stats.norm.pdf(x_grid, mu, sigma), color='black',
             label=f'Normal fit (mean={mu:.2f}, std={sigma:.2f})')
ax_dist.set_title('Price distribution')
ax_dist.legend()

# Q-Q plot against the normal distribution: points that leave the reference
# line indicate departures from normality.
res = stats.probplot(price, plot=ax_qq)

plt.tight_layout()
plt.show()

# **Data Cleaning**
---

**Handling Duplicate Rows**

In [None]:
# Boolean mask that is True for every row repeating an earlier row.
duplicate_rows = data.duplicated()
# Report how many such rows exist (nothing is dropped here).
n_duplicates = duplicate_rows.sum()
print(f"Number of duplicate rows: {n_duplicates}")

**Handling Missing Data**

In [None]:
# Per-column missing-value report: absolute count and share of rows.
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing_data.head(n=10)

In [None]:
# Grand total of missing cells across the whole frame.
nulls_per_column = data.isnull().sum()
total = nulls_per_column.sum()
print('Total Null values =', total)

# **Data Preprocessing**
---

**Formatting**

In [None]:
# Cast the three flag columns to integers.
for flag_col in ('Shared Room', 'Private Room', 'Superhost'):
    data[flag_col] = data[flag_col].astype(int)

**Dealing with Outliers**

In [None]:
# Compress large values of the target with log1p followed by a square root.
# The transform is computed from the untouched raw frame `D`, so re-running
# this cell gives the same result instead of compounding the transform (the
# previous in-place version changed `Price` again on every execution).
# Assignment aligns on the row index, so this also holds after rows of `data`
# have been filtered out.
# NOTE: the model target and the reported metrics are on this transformed scale.
data['Price'] = np.sqrt(np.log1p(D['Price']))

In [None]:
# Columns screened by the IQR-based outlier filter.
# NOTE(review): the spelling 'Restraunt' presumably mirrors the dataset's
# column headers — confirm before "fixing" it.
col_outlier = [
    'Price',
    'City Center (km)',
    'Metro Distance (km)',
    'Attraction Index',
    'Normalised Attraction Index',
    'Restraunt Index',
    'Normalised Restraunt Index',
]

In [None]:
# IQR fence filter, applied one column at a time. Each pass works on the rows
# that survived the previous columns, so the quartiles are recomputed on the
# already-filtered frame.
for col in col_outlier:
    lower_quartile = data[col].quantile(0.25)
    upper_quartile = data[col].quantile(0.75)
    spread = upper_quartile - lower_quartile

    # Keep rows inside [Q1 - 1.7*IQR, Q3 + 1.7*IQR]; both bounds are inclusive.
    inside_fences = data[col].between(lower_quartile - 1.7 * spread,
                                      upper_quartile + 1.7 * spread)
    data = data[inside_fences]



In [None]:
# Re-check the target's distribution against a fitted normal curve.
# `sns.distplot` is deprecated, so the histogram + KDE come from `sns.histplot`
# and the normal fit is overlaid explicitly.
price = data['Price'].dropna()
mu, sigma = stats.norm.fit(price)

fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(price, kde=True, stat='density', ax=ax_dist)
x_grid = np.linspace(price.min(), price.max(), 200)
ax_dist.plot(x_grid, stats.norm.pdf(x_grid, mu, sigma), color='black',
             label=f'Normal fit (mean={mu:.2f}, std={sigma:.2f})')
ax_dist.set_title('Price distribution after preprocessing')
ax_dist.legend()

# Q-Q plot against the normal distribution: points that leave the reference
# line indicate departures from normality.
res = stats.probplot(price, plot=ax_qq)

plt.tight_layout()
plt.show()

**Encoding Categorical Variables**

In [None]:
# Replace every string-typed column with integer codes.
# The single encoder is refit for each column, so after the loop it only
# remembers the classes of the last column processed.
le = LabelEncoder()
list_str = data.select_dtypes(include='object').columns

for cat_col in list_str:
    encoded_values = le.fit_transform(data[cat_col])
    data[cat_col] = encoded_values

In [None]:
# Correlation heatmap over all columns of the frame at this point.
full_correlations = data.corr()
sns.heatmap(full_correlations)

**Data splitting**

In [None]:
# Feature matrix: every column except the target and the ones left out of the model.
excluded_columns = [
    'Price', 'Shared Room', 'Private Room', 'Superhost',
    'Attraction Index', 'Restraunt Index', 'Day', 'Cleanliness Rating',
]
X = data.drop(columns=excluded_columns)
y = data['Price']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=11
)

# **Modeling**

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as XGB


In [None]:
# Candidate models as (display name, estimator) pairs, all with default
# hyper-parameters. Estimators that accept a seed share the same one.
model_seed = 42
regressors = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(random_state=model_seed)),
    ('Lasso Regression', Lasso(random_state=model_seed)),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=model_seed)),
    ('Random Forest Regressor', RandomForestRegressor(random_state=model_seed)),
    ('K-Nearest Neighbors Regressor', KNeighborsRegressor()),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=model_seed)),
    ('XGBRegressor', XGB.XGBRegressor(random_state=model_seed)),
]

In [None]:
# Fit every candidate on the training split and print its hold-out metrics.
# `ytest` holds the transformed target, so the errors are on that scale.
metric_reports = (
    ('R2 Score: ', r2_score),
    ("MSE: ", mean_squared_error),
    ("MAE: ", mean_absolute_error),
)
for clf_name, clf in regressors:
    clf.fit(xtrain, ytrain)
    y_pred = clf.predict(xtest)
    print(f'{clf_name}:')
    for label, metric in metric_reports:
        print(label, metric(ytest, y_pred))
    print('------------------------------------')
