In [56]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae


In [57]:
url = '/Users/miriamg/Documents/Mid-Term-Project_ML_Ironhack/Mid-term project/Cleaning/data_for_revenue_pred.csv'
airbnb_df = pd.read_csv(url)
airbnb_df.columns

Index(['Unnamed: 0', 'id', 'superhost', 'neighbourhood', 'room_type',
       'accommodates', 'bathrooms', 'beds', 'price', 'availability_365',
       'number_of_reviews', 'review_scores_rating'],
      dtype='object')

In [58]:
airbnb_df['days_rented'] = [365 - a for a in airbnb_df['availability_365']]


In [59]:
#Adding column of revenue which is our target variable

airbnb_df['revenue'] = airbnb_df['days_rented']* airbnb_df['price']
airbnb_df.drop(columns=['availability_365', 'Unnamed: 0', 'id'], inplace= True)
airbnb_df

Unnamed: 0,superhost,neighbourhood,room_type,accommodates,bathrooms,beds,price,number_of_reviews,review_scores_rating,days_rented,revenue
0,f,Cuajimalpa de Morelos,Entire home/apt,2,1.0,1,3390.0,0,0.00,2,6780.0
1,f,Cuauhtémoc,Entire home/apt,14,5.5,8,18000.0,64,4.58,0,0.0
2,t,Cuauhtémoc,Entire home/apt,2,1.5,1,3350.0,221,4.90,276,924600.0
3,f,Gustavo A. Madero,Private room,2,1.0,1,339.0,6,4.83,279,94581.0
4,f,Cuauhtémoc,Entire home/apt,2,1.5,2,1273.0,22,4.67,36,45828.0
...,...,...,...,...,...,...,...,...,...,...,...
25705,f,Benito Juárez,Entire home/apt,2,1.0,1,970.0,0,0.00,100,97000.0
25706,f,Cuauhtémoc,Private room,2,1.0,1,175.0,0,0.00,103,18025.0
25707,f,Cuauhtémoc,Entire home/apt,5,2.0,3,1166.0,0,0.00,99,115434.0
25708,f,Cuauhtémoc,Private room,2,1.0,1,286.0,0,0.00,98,28028.0


In [92]:
categorical_df = airbnb_df.select_dtypes(include = 'object')
categorical_df.nunique()

superhost         2
neighbourhood    16
room_type         4
dtype: int64

In [93]:
categorical_nominals_df = pd.get_dummies(categorical_df, dtype= int)

categorical_nominals_df

Unnamed: 0,superhost_f,superhost_t,neighbourhood_Azcapotzalco,neighbourhood_Benito Juárez,neighbourhood_Coyoacán,neighbourhood_Cuajimalpa de Morelos,neighbourhood_Cuauhtémoc,neighbourhood_Gustavo A. Madero,neighbourhood_Iztacalco,neighbourhood_Iztapalapa,...,neighbourhood_Milpa Alta,neighbourhood_Tlalpan,neighbourhood_Tláhuac,neighbourhood_Venustiano Carranza,neighbourhood_Xochimilco,neighbourhood_Álvaro Obregón,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25705,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
25706,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
25707,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
25708,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [94]:
numerical_df = airbnb_df.select_dtypes(include ='number')
numerical_df.drop(columns= ['number_of_reviews',
       'review_scores_rating'], inplace= True)

numerical_df


Unnamed: 0,accommodates,bathrooms,beds,price,days_rented,revenue
0,2,1.0,1,3390.0,2,6780.0
1,14,5.5,8,18000.0,0,0.0
2,2,1.5,1,3350.0,276,924600.0
3,2,1.0,1,339.0,279,94581.0
4,2,1.5,2,1273.0,36,45828.0
...,...,...,...,...,...,...
25705,2,1.0,1,970.0,100,97000.0
25706,2,1.0,1,175.0,103,18025.0
25707,5,2.0,3,1166.0,99,115434.0
25708,2,1.0,1,286.0,98,28028.0


In [95]:
complete_df = pd.concat([categorical_nominals_df, numerical_df],axis=1)

n_columns = complete_df.columns
complete_df


Unnamed: 0,superhost_f,superhost_t,neighbourhood_Azcapotzalco,neighbourhood_Benito Juárez,neighbourhood_Coyoacán,neighbourhood_Cuajimalpa de Morelos,neighbourhood_Cuauhtémoc,neighbourhood_Gustavo A. Madero,neighbourhood_Iztacalco,neighbourhood_Iztapalapa,...,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,accommodates,bathrooms,beds,price,days_rented,revenue
0,1,0,0,0,0,1,0,0,0,0,...,1,0,0,0,2,1.0,1,3390.0,2,6780.0
1,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,14,5.5,8,18000.0,0,0.0
2,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,2,1.5,1,3350.0,276,924600.0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,1,0,2,1.0,1,339.0,279,94581.0
4,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,2,1.5,2,1273.0,36,45828.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25705,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,2,1.0,1,970.0,100,97000.0
25706,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,2,1.0,1,175.0,103,18025.0
25707,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,5,2.0,3,1166.0,99,115434.0
25708,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,2,1.0,1,286.0,98,28028.0


 ## X-y split

In [96]:
X = complete_df.drop(columns=['revenue'])
y = complete_df[['revenue']]
y

Unnamed: 0,revenue
0,6780.0
1,0.0
2,924600.0
3,94581.0
4,45828.0
...,...
25705,97000.0
25706,18025.0
25707,115434.0
25708,28028.0


## Train-test split

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=42)
X_train

Unnamed: 0,superhost_f,superhost_t,neighbourhood_Azcapotzalco,neighbourhood_Benito Juárez,neighbourhood_Coyoacán,neighbourhood_Cuajimalpa de Morelos,neighbourhood_Cuauhtémoc,neighbourhood_Gustavo A. Madero,neighbourhood_Iztacalco,neighbourhood_Iztapalapa,...,neighbourhood_Xochimilco,neighbourhood_Álvaro Obregón,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,accommodates,bathrooms,beds,price
15832,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,2,1.0,1,690.0
9922,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1.0,1,500.0
20431,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,4,1.5,2,1414.0
19898,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,4,2.0,2,1086.0
17172,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,2,1.5,1,1623.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,1.0,1,678.0
5390,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,7,3.5,4,2360.0
860,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,2,1.0,1,650.0
15795,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,4,1.0,2,1900.0


## Standardize the data. PowerTransformer

In [97]:
scaler = PowerTransformer()

In [98]:
scaler_fitted = scaler.fit(X_train)
X_train= scaler_fitted.transform(X_train)
X_train

array([[-1.24222722,  1.24222722, -0.10643809, ..., -0.64572901,
        -0.86142475, -0.43376978],
       [-1.24222722,  1.24222722, -0.10643809, ..., -0.64572901,
        -0.86142475, -0.89168834],
       [-1.24222722,  1.24222722, -0.10643809, ...,  0.4346487 ,
         0.48862648,  0.47706134],
       ...,
       [ 0.80500571, -0.80500571, -0.10643809, ..., -0.64572901,
        -0.86142475, -0.51625281],
       [-1.24222722,  1.24222722, -0.10643809, ..., -0.64572901,
         0.48862648,  0.81225601],
       [ 0.80500571, -0.80500571, -0.10643809, ...,  1.50013464,
         0.48862648,  0.96581501]])

In [99]:
X_test = scaler_fitted.transform(X_test)
X_test

array([[-1.24222722,  1.24222722, -0.10643809, ..., -0.64572901,
         0.48862648,  0.54580654],
       [-1.24222722,  1.24222722, -0.10643809, ..., -0.64572901,
         1.14045002,  0.44846129],
       [ 0.80500571, -0.80500571, -0.10643809, ..., -0.64572901,
        -0.86142475, -1.66021753],
       ...,
       [ 0.80500571, -0.80500571, -0.10643809, ..., -0.64572901,
        -0.86142475,  0.44074652],
       [ 0.80500571, -0.80500571, -0.10643809, ..., -0.64572901,
         0.48862648, -0.15560663],
       [ 0.80500571, -0.80500571, -0.10643809, ..., -0.64572901,
         0.48862648,  0.75191435]])

In [100]:
scaler_fitted_y= scaler.fit(y_train)
y_train= scaler_fitted_y.transform(y_train)
y_train

array([[ 0.64424144],
       [-1.3813721 ],
       [ 1.05053171],
       ...,
       [-0.86531656],
       [ 1.2973203 ],
       [-0.59437681]])

In [101]:
y_test = scaler_fitted_y.transform(y_test)
y_test

array([[ 0.12869557],
       [ 1.15502275],
       [-1.2155945 ],
       ...,
       [ 1.15625695],
       [-0.94672937],
       [-2.12878999]])

## Applying linear regression

In [102]:
lm = LinearRegression()
model = lm.fit(X_train, y_train)

In [103]:
model.coef_

array([[8.03489478e+12, 8.03489478e+12, 9.49642756e+10, 2.98927462e+11,
        2.25362597e+11, 1.21131636e+11, 4.48597888e+11, 1.19950202e+11,
        1.03275503e+11, 9.47466805e+10, 6.44880805e+10, 3.45805418e+11,
        3.18137455e+10, 1.52827364e+11, 3.89515802e+10, 1.41858733e+11,
        6.73398011e+10, 1.79291131e+11, 6.02110556e+11, 7.32245383e+10,
        5.95451846e+11, 1.39484109e+11, 3.28369141e-02, 5.29174805e-02,
        3.86962891e-02, 2.59643555e-01]])

In [104]:
model.intercept_

array([0.00380766])

## Create predictions for the test set

In [108]:
y_pred = model.predict(X_test)
y_pred 

array([[ 0.08178924],
       [ 0.3198965 ],
       [-0.5866897 ],
       ...,
       [-0.04114034],
       [-0.54315168],
       [ 0.07892416]])

In [109]:
y_pred = pd.DataFrame(y_pred)
y_pred

Unnamed: 0,0
0,0.081789
1,0.319897
2,-0.586690
3,-0.344352
4,-0.643136
...,...
6423,0.081657
6424,0.168145
6425,-0.041140
6426,-0.543152


In [110]:
y_test = pd.DataFrame(y_test)
y_test = y_test.reset_index(drop = True)
y_test

Unnamed: 0,0
0,0.128696
1,1.155023
2,-1.215594
3,0.243943
4,0.125083
...,...
6423,-0.925689
6424,0.656828
6425,1.156257
6426,-0.946729


In [111]:
residuals_df = pd.concat([y_test, y_pred], axis=1)
residuals_df.columns = ["y_test", "y_pred"]
residuals_df

Unnamed: 0,y_test,y_pred
0,0.128696,0.081789
1,1.155023,0.319897
2,-1.215594,-0.586690
3,0.243943,-0.344352
4,0.125083,-0.643136
...,...,...
6423,-0.925689,0.081657
6424,0.656828,0.168145
6425,1.156257,-0.041140
6426,-0.946729,-0.543152


In [112]:
residuals_df["residual"] = residuals_df["y_test"]- residuals_df["y_pred"]

In [113]:
residuals_df.head()

Unnamed: 0,y_test,y_pred,residual
0,0.128696,0.081789,0.046906
1,1.155023,0.319897,0.835126
2,-1.215594,-0.58669,-0.628905
3,0.243943,-0.344352,0.588295
4,0.125083,-0.643136,0.76822


## Calculating metrics

In [114]:

mean_error = residuals_df["residual"].mean()
mean_error

-0.002583512046298568

In [115]:
mse(y_test, y_pred)

0.8390896977238224

In [116]:
mae(y_test, y_pred)

0.7501388678263806

In [117]:
rmse= mse( y_test, y_pred, squared = False)
rmse

0.9160183937693731

## Computing R2_score for test_set

In [118]:
round(r2_score(y_test, y_pred),2)

0.17

## Computing R2_score for train_set

In [119]:
y_pred_train = pd.DataFrame(model.predict(X_train))

In [120]:
round(r2_score(y_train, y_pred_train),2)

0.16

## Feature importance plot

In [121]:
feature_names = list(X.columns) #Why the names of the columns are transformed into numbers?
feature_names 

['superhost_f',
 'superhost_t',
 'neighbourhood_Azcapotzalco',
 'neighbourhood_Benito Juárez',
 'neighbourhood_Coyoacán',
 'neighbourhood_Cuajimalpa de Morelos',
 'neighbourhood_Cuauhtémoc',
 'neighbourhood_Gustavo A. Madero',
 'neighbourhood_Iztacalco',
 'neighbourhood_Iztapalapa',
 'neighbourhood_La Magdalena Contreras',
 'neighbourhood_Miguel Hidalgo',
 'neighbourhood_Milpa Alta',
 'neighbourhood_Tlalpan',
 'neighbourhood_Tláhuac',
 'neighbourhood_Venustiano Carranza',
 'neighbourhood_Xochimilco',
 'neighbourhood_Álvaro Obregón',
 'room_type_Entire home/apt',
 'room_type_Hotel room',
 'room_type_Private room',
 'room_type_Shared room',
 'accommodates',
 'bathrooms',
 'beds',
 'price',
 'days_rented']

In [124]:
coefficients = model.coef_
coefficients = [i for i in coefficients[0]]

fi = {"name": feature_names, "coeff": coefficients}
fi_df = pd.DataFrame(fi)

fi_df

ValueError: All arrays must be of the same length

In [125]:
import seaborn as sns
import matplotlib.pyplot as plt

fi_df = fi_df.sort_values(by='coeff', ascending=False)
plt.figure(figsize=(10,12))

sns.barplot(y= fi_df['name'], x = fi_df['coeff'])

NameError: name 'fi_df' is not defined