In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
listings = pd.read_csv('data/listings.csv')
reviews1 = pd.read_csv('data/reviews1.csv')
reviews2 = pd.read_csv('data/reviews2.csv')
reviews3 = pd.read_csv('data/reviews3.csv')
reviews4 = pd.read_csv('data/reviews4.csv')

# 5.2 Preprocessing

Firts we are concatenating the reviews files in one dataframe

In [3]:
reviews = pd.concat([reviews1, reviews2, reviews3, reviews4])

What is the new shape of reviews dataframe?

In [4]:
reviews.shape[0]

406607

In [5]:
reviews.shape[1]

6

Reviews dataframe contains 406606 rows and 6 columns

Now we will check how many NaN values this dataframe has and how many in each column

In [6]:
print(reviews.isnull().any(axis=1).sum())
print(reviews.isna().sum())

407
listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name      0
comments         407
dtype: int64


We see that there are NaN values only in 'coments' column

In [7]:
reviews[reviews['comments'].isnull()]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
12637,1094063,141658642,2017-04-04,94260523,Mihnea Cristian,
27445,3117246,536155060,2019-09-26,190308651,Ιωάννης,
29666,3431705,408390363669712904,2021-07-17,344782537,Cosima,
33414,4245658,273166480,2018-06-05,15352587,Alex,
34479,4443443,702927635,2020-10-23,142932343,Sergej,
...,...,...,...,...,...,...
104701,50870387,464974515212855632,2021-10-03,14890280,Martine,
104961,50964566,434507705277780250,2021-08-22,141694244,Alfred,
105863,51387905,431580647027475196,2021-08-18,407884103,Γεωργια,
106232,51729011,442531781646404185,2021-09-02,380942310,Γιωργος,


We are going to delete theese rows because they have no use and they are only 0.1% of rows of the whole dataset

In [8]:
reviews = reviews[reviews['comments'].notna()]

Check that rows are deleted and that there are no more NaN values in the dataframe

In [9]:
print(len(reviews))
print(reviews.isnull().any(axis=1).sum())

406200
0


We print listings dataframe summary statistics on each attribute.

In [10]:
listings.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,bathrooms,...,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,9582.0,9582.0,9582.0,9579.0,9579.0,0.0,9582.0,9582.0,9582.0,0.0,...,9582.0,9582.0,9582.0,9582.0,9582.0,9582.0,9582.0,9582.0,9582.0,7874.0
mean,31899670.0,20211030000000.0,143705200.0,17.809375,17.809375,,37.979937,23.731852,3.829159,,...,53.227719,230.26101,42.43446,8.671885,1.14068,11.083072,9.933312,0.920685,0.097683,1.86097
std,13317090.0,5.019793,115328900.0,60.121461,60.121461,,0.013226,0.012372,1.959972,,...,33.765424,130.262062,69.982756,14.536008,1.92865,23.451291,23.081999,4.082871,1.015924,2.049245
min,10595.0,20211030000000.0,37177.0,0.0,0.0,,37.95055,23.6977,1.0,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.01
25%,22516950.0,20211030000000.0,38499210.0,1.0,1.0,,37.969223,23.724032,2.0,,...,22.0,117.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.43
50%,32781730.0,20211030000000.0,128850200.0,3.0,3.0,,37.978386,23.72983,4.0,,...,66.0,283.0,12.0,2.0,0.0,3.0,2.0,0.0,0.0,1.2
75%,42648480.0,20211030000000.0,224451900.0,11.0,11.0,,37.98796,23.73793,4.0,,...,83.0,350.0,52.0,12.0,2.0,9.0,8.0,0.0,0.0,2.64
max,52960300.0,20211030000000.0,428199300.0,2095.0,2095.0,,38.03243,23.78022,16.0,,...,90.0,365.0,745.0,346.0,20.0,156.0,156.0,43.0,14.0,28.0


We can see that there are columns that have a minimum value of zero (0). On some columns, a value of zero maybe does not make sense and indicates an invalid or missing value. We can also see NaN values that we may have to handle

In [11]:
listings.head(10)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,number_of_reviews_l30d,first_review,last_review,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,10595,https://www.airbnb.com/rooms/10595,20211025162728,2021-10-26,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...,Ampelokipi district is nice multinational and ...,https://a0.muscache.com/pictures/f7e19a44-5afe...,37177,https://www.airbnb.com/users/show/37177,...,0,2015-05-25,2019-04-04,957568.0,t,6,6,0,0,0.41
1,10990,https://www.airbnb.com/rooms/10990,20211025162728,2021-10-25,Athens Quality Apartments - Deluxe Apartment,Athens Quality Apartments - Deluxe apartment i...,Ampelokipi district is nice multinational and ...,https://a0.muscache.com/pictures/8645179/c1728...,37177,https://www.airbnb.com/users/show/37177,...,1,2015-11-25,2016-02-22,1070920.0,t,6,6,0,0,0.72
2,10993,https://www.airbnb.com/rooms/10993,20211025162728,2021-10-25,Athens Quality Apartments - Studio,The Studio is an <br />-excellent located <br ...,Ampelokipi district is nice multinational and ...,https://a0.muscache.com/pictures/107309527/848...,37177,https://www.airbnb.com/users/show/37177,...,3,2015-10-18,2018-03-31,957080.0,t,6,6,0,0,0.97
3,10995,https://www.airbnb.com/rooms/10995,20211025162728,2021-10-25,"AQA-No2 1-bedroom, smart tv, fiber connection,","AQA No2 is 1-bedroom apartment (47m2), on the ...",Ampelokipi district is nice multinational and ...,https://a0.muscache.com/pictures/6a565613-aaa3...,37177,https://www.airbnb.com/users/show/37177,...,0,2015-12-05,2016-08-06,957422.0,t,6,6,0,0,0.33
4,27262,https://www.airbnb.com/rooms/27262,20211025162728,2021-10-26,"54m2, 1-br, cable tv, wi-fi, metro",Big 1-bedroom apartment that can accommodate 4...,,https://a0.muscache.com/pictures/8651803/4b82b...,37177,https://www.airbnb.com/users/show/37177,...,0,2015-11-12,2017-05-15,957579.0,t,6,6,0,0,0.23
5,28186,https://www.airbnb.com/rooms/28186,20211025162728,2021-10-26,❤️Deluxe central loft near Acropolis❤️,TOP 5 REASONS to stay here<br />⭐️Don't waste ...,"Don't look any further. <br />This is, by far,...",https://a0.muscache.com/pictures/5471146/99f09...,121318,https://www.airbnb.com/users/show/121318,...,1,2013-08-12,2021-09-08,264915.0,f,2,2,0,0,4.65
6,31155,https://www.airbnb.com/rooms/31155,20211025162728,2021-10-25,sleep on sailing boat,Sleeping on a boat is the ultimate glamping ex...,,https://a0.muscache.com/pictures/531153/542d3e...,133845,https://www.airbnb.com/users/show/133845,...,0,,,,f,1,0,1,0,
7,33945,https://www.airbnb.com/rooms/33945,20211025162728,2021-10-26,Spacious Cosy aprtm very close to Metro!,Apartment located near metro station. Safe nei...,Neighbourhood is alive all day and safe all da...,https://a0.muscache.com/pictures/1a7a1026-f5f1...,146553,https://www.airbnb.com/users/show/146553,...,0,2015-08-04,2017-07-14,,f,2,2,0,0,0.94
8,49489,https://www.airbnb.com/rooms/49489,20211025162728,2021-10-26,3bdr apt in the heart of Athens,"Fully furnished, 98 sq.mt., apartment on 44 Er...",The apartment is located at the commercial cen...,https://a0.muscache.com/pictures/9640239/4bfd2...,225612,https://www.airbnb.com/users/show/225612,...,0,2014-12-14,2018-08-28,876484.0,t,1,1,0,0,1.45
9,54637,https://www.airbnb.com/rooms/54637,20211025162728,2021-10-25,Penthouse Parthenon view in Athens,"Art penthouse, with a stunnng view in Acropoli...",,https://a0.muscache.com/pictures/5c9190e4-8660...,256676,https://www.airbnb.com/users/show/256676,...,0,2014-08-10,2018-02-01,,f,1,1,0,0,0.03


We can see again some NaN values.Here and also in the describe function, there are some functions that have everywhere NaN values

In [12]:
len(listings)

9582

We remind that we have 9585 rows

## Handling missing values in the dataset

We are checking how many rows have missing values in the dataset for each column

In [13]:
listings.isna().sum()

id                                                 0
listing_url                                        0
scrape_id                                          0
last_scraped                                       0
name                                               3
                                                ... 
calculated_host_listings_count                     0
calculated_host_listings_count_entire_homes        0
calculated_host_listings_count_private_rooms       0
calculated_host_listings_count_shared_rooms        0
reviews_per_month                               1708
Length: 67, dtype: int64

In [14]:
listings.isnull().any(axis=1).sum()

9582

Every row has at least one NaN value. There are 3 columns that are missing a value in every row. Down below we confirm this

In [15]:
listings[['neighbourhood_group_cleansed', 'bathrooms', 'calendar_updated']]

Unnamed: 0,neighbourhood_group_cleansed,bathrooms,calendar_updated
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
9577,,,
9578,,,
9579,,,
9580,,,


Theese 3 columns are going to be deleted. Before we had 67 columns, so after the drop of the columns we must have 64.

In [16]:
len(listings.columns)

67

In [17]:
listings.drop(['neighbourhood_group_cleansed', 'bathrooms', 'calendar_updated'], axis=1, inplace=True)

In [18]:
len(listings.columns)

64

Drop all the columns that contain URLs

In [19]:
listings.drop(['listing_url', 'picture_url', 'host_url', 'host_thumbnail_url', 'host_picture_url', 'id', 'scrape_id',
               'host_id'], axis=1, inplace=True)

Drop all columns with unecessary strings and sentences. Some of them could be handle with NLP techniques, but we are not going to use them here. We also delete columns 'last_scraped', 'calendar_last_scraped' because they have only one unique value

In [20]:
listings.drop(['name', 'description', 'neighborhood_overview', 'host_name', 'host_location', 'host_about', 'license',
               'neighbourhood', 'host_neighbourhood','host_verifications'], axis=1, inplace=True)
listings.drop(['last_scraped', 'calendar_last_scraped'], axis=1, inplace=True)

In [21]:
listings = listings.drop_duplicates() # there are 2 duplicates (rows) and we drop them

The columns 'host_response_rate', 'host_acceptance_rate' have string values that are percentages. So we convert them to float, deleting '%' char and convert the numbers in range (0,1)

In [22]:
print(listings.host_response_rate.head(10))
listings['host_response_rate'] = listings["host_response_rate"].str.rstrip('%').astype('float')/100.0
print(listings.host_response_rate.head(10))

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5    100%
6     NaN
7     NaN
8    100%
9      0%
Name: host_response_rate, dtype: object
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    1.0
6    NaN
7    NaN
8    1.0
9    0.0
Name: host_response_rate, dtype: float64


In [23]:
print(listings.host_acceptance_rate.head(10))
listings['host_acceptance_rate'] = listings["host_acceptance_rate"].str.rstrip('%').astype('float')/100.0
print(listings.host_acceptance_rate.head(10))

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5    100%
6     NaN
7     50%
8      0%
9     NaN
Name: host_acceptance_rate, dtype: object
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    1.0
6    NaN
7    0.5
8    0.0
9    NaN
Name: host_acceptance_rate, dtype: float64


In [24]:
# Number of columns now:
len(listings.columns)

44

See how many columns contain NaN values and store to a list the amount of NaN features for each column

In [25]:
print(listings.isnull().any(axis=0).sum())
nan_on_rows = listings.isnull().sum(axis=1).tolist()

15


Count how many rows have more than 3 (and other amounts) features as NaN

In [26]:
df = listings.dropna(thresh=listings.shape[1]-3)
print('rows with more than 3 NaN values :')
print(len(listings) - len(df))

rows with more than 3 NaN values :
1075


In [27]:
# above 7 in 4 rows
# above 8 in 3 rows
# above 6 in 48 rows
# above 5 in 634 rows

Deleting some rows will decrease the mean absolute error, and after some experiments deleting rows with more than 3 NaN values was the better decision. So we are deleting theese rows.

In [28]:
listings = listings.dropna(thresh=listings.shape[1]-3)
print(df.isnull().any(axis=0).sum()) # how many columns contain NaN values now

9


Fill NaN values. Columns with NaN or numeric values will only be affected. Theese columns are: 'bedrooms', 'beds', 'reviews_per_month' and also 'host_response_rate', 'host_acceptance_rate' . Fill with median of the column

In [29]:
listings[['bedrooms', 'beds', 'reviews_per_month']] = listings[['bedrooms', 'beds', 'reviews_per_month']].\
    fillna(listings.median())
listings[['host_response_rate', 'host_acceptance_rate']] = listings[['host_response_rate', 'host_acceptance_rate']].\
    fillna(listings.median())

  fillna(listings.median())
  fillna(listings.median())


Our target feature is going to be 'price' and we have to convert it to numeric value. We delete the $ signs and we keep only the amount

In [30]:
listings['price'] = [float(x[1:].replace(",", "")) if "," in x else float(x[1:]) for x in listings["price"]]

We are going to convert the values of some columns in categorical values using map

In [31]:
host_response_time_map = {np.nan: 0, "within an hour": 0, "within a few hours": 1, "within a day": 2,
                          "a few days or more": 3}
host_is_superhost_map = {np.nan: 0, "f": 0, "t": 1}
host_has_profile_pic_map = {"f": 0, np.nan: 1, "t": 1}
host_identity_verified_map = {"f": 0, np.nan: 1, "t": 1}
room_type_map = {"Entire home/apt": 0, "Private room": 1, "Shared room": 2, "Hotel room": 3}
has_availability_map = {"f": 0, "t": 1}
instant_bookable_map = {"f": 0, "t": 1}

In [32]:
listings['host_response_time'] = listings['host_response_time'].map(host_response_time_map)
listings['host_is_superhost'] = listings['host_is_superhost'].map(host_is_superhost_map)
listings['host_has_profile_pic'] = listings['host_has_profile_pic'].map(host_has_profile_pic_map)
listings['host_identity_verified'] = listings['host_identity_verified'].map(host_identity_verified_map)
listings['room_type'] = listings['room_type'].map(room_type_map)
listings['has_availability'] = listings['has_availability'].map(has_availability_map)
listings['instant_bookable'] = listings['instant_bookable'].map(instant_bookable_map)

The column 'bathrooms' is in string type, and contains number and text. So, we are going to keep only the number that indicates the amount of bathrooms and convert it to numeric

In [33]:
print(listings['bathrooms_text'].head(5)) # before converting
listings['bathrooms_text'] = listings['bathrooms_text'].str[0]
listings['bathrooms'] = pd.to_numeric(listings['bathrooms_text'], errors='coerce')
listings.drop(['bathrooms_text'], axis=1, inplace=True)
print(listings['bathrooms'].head(5)) # after covverting

0    2 baths
1     1 bath
3     1 bath
4     1 bath
5     1 bath
Name: bathrooms_text, dtype: object
0    2.0
1    1.0
3    1.0
4    1.0
5    1.0
Name: bathrooms, dtype: float64


Now some of theese columns have NaN values and we are going to fill them again with the median of the column

In [34]:
listings[['host_listings_count', 'host_total_listings_count', 'bathrooms']] = listings[['host_listings_count',
                                                                                        'host_total_listings_count',
                                                                                        'bathrooms']].\
                                                                                    fillna(listings.median())

  fillna(listings.median())


In [35]:
298/8505

0.035038212815990594

We have to convert 'neighbourhood_cleansed' and 'property_type' columns in numeric type. So, for each of those columns we replace each unique value, with the total frequency of the value divided by the total amount of rows. For example, in 'neighbourhood_cleansed' column we have the value "ΑΜΠΕΛΟΚΗΠΟΙ" 298 times in 8505 rows. So where we see "ΑΜΠΕΛΟΚΗΠΟΙ" we replace it with 298/8505 = 0.035. Example of both  columns are seen before and after the mapping.

In [36]:
print(listings.neighbourhood_cleansed.head(10))
print(listings.property_type.head(10))

0                          ΑΜΠΕΛΟΚΗΠΟΙ
1                          ΑΜΠΕΛΟΚΗΠΟΙ
3                          ΑΜΠΕΛΟΚΗΠΟΙ
4                          ΑΜΠΕΛΟΚΗΠΟΙ
5               ΕΜΠΟΡΙΚΟ ΤΡΙΓΩΝΟ-ΠΛΑΚΑ
7                       ΑΓΙΟΣ ΝΙΚΟΛΑΟΣ
8               ΕΜΠΟΡΙΚΟ ΤΡΙΓΩΝΟ-ΠΛΑΚΑ
9                             ΠΑΓΚΡΑΤΙ
10    ΑΓΙΟΣ ΚΩΝΣΤΑΝΤΙΝΟΣ-ΠΛΑΤΕΙΑ ΒΑΘΗΣ
11                              ΣΤΑΔΙΟ
Name: neighbourhood_cleansed, dtype: object
0     Entire rental unit
1     Entire rental unit
3     Entire rental unit
4     Entire rental unit
5            Entire loft
7     Entire rental unit
8     Entire rental unit
9     Entire rental unit
10    Entire rental unit
11    Entire rental unit
Name: property_type, dtype: object


In [37]:
neighbourhood_cleansed_map = dict(round(listings["neighbourhood_cleansed"].value_counts() / len(listings), 3))
property_type_map = dict(round(listings["property_type"].value_counts() / len(listings), 3))

listings["neighbourhood_cleansed"] = listings["neighbourhood_cleansed"].map(neighbourhood_cleansed_map)
listings["property_type"] = listings["property_type"].map(property_type_map)

In [38]:
print(listings.neighbourhood_cleansed.head(10))
print(listings.property_type.head(10))

0     0.035
1     0.035
3     0.035
4     0.035
5     0.191
7     0.007
8     0.191
9     0.042
10    0.062
11    0.022
Name: neighbourhood_cleansed, dtype: float64
0     0.730
1     0.730
3     0.730
4     0.730
5     0.017
7     0.730
8     0.730
9     0.730
10    0.730
11    0.730
Name: property_type, dtype: float64


Now, we will fill the NaN values of the columns with date (["host_since", "first_review", "last_review"]). We will do this, based on the 'neighbourhood_cleansed' column. The procedure goes as follows. For each date and for each neighborhood, we consider the subset of the dataframe 
where this specific neighborhood tag appears in "neighbourhood_cleansed" feature, we calculate the median value 
of the date in this subset and we fill the nan values of this date with the previous median value in the specific 
rows.

In [39]:
date_features_nan = ["host_since", "first_review", "last_review"]
for date in date_features_nan:
    for neighbourhood in listings["neighbourhood_cleansed"].unique():
        condition = listings["neighbourhood_cleansed"] == neighbourhood
        # convert the object type into datetime64[ns] and get the median
        neighbourhood_date_series = listings[condition][date]
        neighbourhood_date_series = pd.to_datetime(neighbourhood_date_series)
        median = neighbourhood_date_series.quantile(0.5, interpolation="midpoint")
        # fill the specific nan values
        listings[date] = listings[date].mask(condition, listings[date].fillna(value=str(median)[0:10]))

Since seasonality plays an important role in the price, we create one feature for each one of the remaining date
features which will consist of the seasons. The values 0, 1, 2, 3 stand for spring, summer, autumn and winter 
respectively. Furthermore, we create one more feature which will contain the years for each date feature. 

In [40]:
date_features = ["host_since", "first_review", "last_review"]
for date in date_features:
    years = []
    seasons = []
    for i in range(len(listings)):
        years.append(int(listings[date].iloc[i][0:4]))
        month = listings[date].iloc[i][5:7]
        if month == "03" or month == "04" or month == "05":
            seasons.append(0)
        elif month == "06" or month == "07" or month == "08":
            seasons.append(1)
        elif month == "09" or month == "10" or month == "11":
            seasons.append(2)
        else:
            seasons.append(3)
    # construct the new two features
    listings[date + "_year"] = years
    listings[date + "_season"] = seasons
    # drop the old one
    listings.drop(date, axis=1, inplace=True)

Now, we will handle the 'amenities' column. We will keep only the top 20 amenities we found on question 5.1.

In [41]:
top_20_amenities = {"Essentials": [], "Wifi": [], "Hair dryer": [], "Long term stays allowed": [],
                        "Air conditioning": [], "Hangers": [], "Kitchen": [], "Iron": [],
                        "Heating": [], "Hot water": [], "Dishes and silverware": [], "TV": [],
                        "Cooking basics": [], "Refrigerator": [], "Coffee maker": [], "Dedicated workspace": [],
                        "Shampo": [], "Bed linens": [], "Elevator": [], "Fire extinguisher": []}


For each sample and for each amenity of the previous dictionary, if amenity exists in "amenities", then we give 
the value 1 (it exists). Otherwise, we give the value 0 (it does not exist). When the procedure ends, we drop 
the column "amenities". We are doing multi-hot encoding

In [42]:
for idx in range(len(listings)):
    amenities_idx = listings["amenities"].iloc[idx].split(", ")
    for idy in range(len(amenities_idx)):
        if "[" in amenities_idx:
            amenities_idx[idy] = amenities_idx[idy][2:]
        if "]" in amenities_idx:
            amenities_idx[idy] = amenities_idx[idy][:-2]
        amenities_idx[idy] = amenities_idx[idy][1:-1]
        if idy == 0:
            amenities_idx[idy] = amenities_idx[idy][1:]
        if idy == len(amenities_idx) - 1:
            amenities_idx[idy] = amenities_idx[idy][:-2]
    for amenity in top_20_amenities.keys():
        if amenity in amenities_idx:
            top_20_amenities[amenity].append(1)
        else:
            top_20_amenities[amenity].append(0)
for amenity in top_20_amenities.keys():
    listings["amenity_" + amenity.lower()] = top_20_amenities[amenity]
listings.drop("amenities", axis=1, inplace=True)

Split to X and y and to train and test sets.

In [43]:
y = listings['price']
X = listings.drop('price', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [44]:
print(X_train.shape, X_test.shape)

(5953, 65) (2552, 65)


We try to delete rows with outliers. For each column, first it computes the Z-score of each value in the column, relative to the column mean and standard deviation. Then it takes the absolute of Z-score because the direction does not matter, only if it is below the threshold. .all(axis=1) ensures that for each row, all column satisfy the constrain. We put threshold 8 to delete extreme outliers.

In [45]:
temp = len(X_train)
non_outlier_mask = (np.abs(stats.zscore(X_train)) < 7).all(axis=1)
X_train = X_train[non_outlier_mask]
y_train = y_train[non_outlier_mask]
print(temp - len(X_train)) # how many rows got deleted

211


In [46]:
print('our shape in the training set after preprocessing data', X_train.shape)

our shape in the training set after preprocessing data (5742, 65)


In [47]:
print('our shape in the test set after preprocessing data', X_test.shape)

our shape in the test set after preprocessing data (2552, 65)


Now we are going to standardize the columns with StandardScaler()

In [48]:
std_scaler = StandardScaler()

X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)
X_scaled = std_scaler.fit_transform(X)