In [1]:
# installing libraries for use within the project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# we will use pandas profiling for a full scale analysis of our dataset
import pandas_profiling
import seaborn as sns

In [2]:
# read_csv from pandas is used to read the csv dataset
data = pd.read_csv('train.csv')

In [3]:
# we use DataFrame method in pandas to convert our dataset into pandas data frame
traindata = pd.DataFrame(data)

In [4]:
traindata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2870 entries, 0 to 2869
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   2870 non-null   int64  
 1   region               2870 non-null   object 
 2   latitude             2870 non-null   float64
 3   longitude            2870 non-null   float64
 4   accommodation_type   2870 non-null   object 
 5   cost                 2870 non-null   int64  
 6   minimum_nights       2870 non-null   int64  
 7   number_of_reviews    2870 non-null   int64  
 8   reviews_per_month    2194 non-null   float64
 9   owner_id             2870 non-null   int64  
 10  owned_hotels         2870 non-null   int64  
 11  yearly_availability  2870 non-null   int64  
dtypes: float64(3), int64(7), object(2)
memory usage: 269.2+ KB


In [5]:
# as we can see above in the 'reviews_per_month' column we have some values which are not 'non-null' and we have
# two categorical columns

In [6]:
traindata.head()

Unnamed: 0,id,region,latitude,longitude,accommodation_type,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability
0,13232,Manhattan,40.71854,-74.00439,Entire home/apt,170,5,7,0.56,929983,1,0
1,246,Brooklyn,40.64446,-73.9503,Entire home/apt,65,3,238,2.3,281764,1,0
2,19091,Queens,40.78573,-73.81062,Private room,85,1,0,,19923341,1,1
3,34305,Manhattan,40.73863,-73.98002,Private room,210,30,0,,200380610,65,1
4,444,Manhattan,40.82426,-73.9463,Shared room,75,3,38,0.42,745069,3,1


In [7]:
traindata.describe()

Unnamed: 0,id,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability
count,2870.0,2870.0,2870.0,2870.0,2870.0,2870.0,2194.0,2870.0,2870.0,2870.0
mean,26760.657143,40.731224,-73.950158,195.943206,11.530314,16.315331,1.157502,72021950.0,8.411498,0.498606
std,14140.930062,0.054942,0.049745,406.184714,37.972339,32.481722,1.355028,80765160.0,27.105522,0.500085
min,0.0,40.50708,-74.24285,10.0,1.0,0.0,0.01,2787.0,1.0,0.0
25%,15931.75,40.692463,-73.984003,75.0,1.0,1.0,0.24,7388002.0,1.0,0.0
50%,28946.5,40.72825,-73.95672,120.0,3.0,4.0,0.65,33527080.0,1.0,0.0
75%,38478.5,40.762658,-73.934202,200.0,6.0,16.0,1.53,120762500.0,3.0,1.0
max,48893.0,40.89873,-73.72173,9999.0,999.0,395.0,10.37,273812300.0,327.0,1.0


In [8]:
# for a much better understanding of our dataset we will use the package pandas_profiling
#pandas_profiling.ProfileReport(traindata, explorative=True, minimal = True)

In [9]:
# pip install pyqt5

In [10]:
# for a better depiction of correlation between features
correlation = traindata.corr()
plt.figure(figsize=(18,18))
sns.heatmap(correlation, vmax=1, square = True, annot=True, cmap='viridis')
plt.title('Correlation')
plt.savefig("correlation.png")

In [11]:
# we use get_dummies method in pandas to convert region and accomodation column of the dataset from type:object to type:numerical
# it is easier to work with numerical data as compared to categorical data

In [12]:
traindata = pd.get_dummies(traindata, columns=['region','accommodation_type'])

In [13]:
traindata.head()

Unnamed: 0,id,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability,region_Bronx,region_Brooklyn,region_Manhattan,region_Queens,region_Staten Island,accommodation_type_Entire home/apt,accommodation_type_Private room,accommodation_type_Shared room
0,13232,40.71854,-74.00439,170,5,7,0.56,929983,1,0,0,0,1,0,0,1,0,0
1,246,40.64446,-73.9503,65,3,238,2.3,281764,1,0,0,1,0,0,0,1,0,0
2,19091,40.78573,-73.81062,85,1,0,,19923341,1,1,0,0,0,1,0,0,1,0
3,34305,40.73863,-73.98002,210,30,0,,200380610,65,1,0,0,1,0,0,0,1,0
4,444,40.82426,-73.9463,75,3,38,0.42,745069,3,1,0,0,1,0,0,0,0,1


In [14]:
traindata.dtypes # as we can see there is no categorical data in the dataset now

id                                      int64
latitude                              float64
longitude                             float64
cost                                    int64
minimum_nights                          int64
number_of_reviews                       int64
reviews_per_month                     float64
owner_id                                int64
owned_hotels                            int64
yearly_availability                     int64
region_Bronx                            uint8
region_Brooklyn                         uint8
region_Manhattan                        uint8
region_Queens                           uint8
region_Staten Island                    uint8
accommodation_type_Entire home/apt      uint8
accommodation_type_Private room         uint8
accommodation_type_Shared room          uint8
dtype: object

In [15]:
# also we can see from the Profile Report that the reviews_per_month has 676 missing values. To take care 
# of these missing values we use MICE algorithm to predict the missing values. 

In [16]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

In [17]:
lr=LinearRegression()
imp = IterativeImputer(estimator=lr,missing_values=np.nan, max_iter=10, verbose=2, imputation_order='roman',random_state=0)

In [18]:
traindata_imputed=imp.fit_transform(traindata)

[IterativeImputer] Completing matrix with shape (2870, 18)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.09
[IterativeImputer] Change: 2.019535004232412, scaled tolerance: 273812.306 
[IterativeImputer] Early stopping criterion reached.


In [19]:
print(traindata_imputed)

[[ 1.323200e+04  4.071854e+01 -7.400439e+01 ...  1.000000e+00
   0.000000e+00  0.000000e+00]
 [ 2.460000e+02  4.064446e+01 -7.395030e+01 ...  1.000000e+00
   0.000000e+00  0.000000e+00]
 [ 1.909100e+04  4.078573e+01 -7.381062e+01 ...  0.000000e+00
   1.000000e+00  0.000000e+00]
 ...
 [ 2.348500e+04  4.076619e+01 -7.398987e+01 ...  1.000000e+00
   0.000000e+00  0.000000e+00]
 [ 1.675700e+04  4.074637e+01 -7.397207e+01 ...  0.000000e+00
   1.000000e+00  0.000000e+00]
 [ 2.355000e+03  4.079208e+01 -7.396482e+01 ...  0.000000e+00
   0.000000e+00  1.000000e+00]]


In [20]:
# since our imputed dataset is not of the type pandas dataframe we convert it into one
traindata_imputed= pd.DataFrame(traindata_imputed)

In [21]:
type(traindata_imputed)

pandas.core.frame.DataFrame

In [22]:
traindata_imputed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,13232.0,40.71854,-74.00439,170.0,5.0,7.0,0.56,929983.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,246.0,40.64446,-73.9503,65.0,3.0,238.0,2.3,281764.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,19091.0,40.78573,-73.81062,85.0,1.0,0.0,0.326637,19923341.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,34305.0,40.73863,-73.98002,210.0,30.0,0.0,0.872628,200380610.0,65.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,444.0,40.82426,-73.9463,75.0,3.0,38.0,0.42,745069.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [23]:
# the column names were reset so to get it back we use the below code
traindata_imputed.columns =['id','latitude','longitude','cost','minimum_nights','number_of_reviews','reviews_per_month','owner_id','owned_hotels','yearly_availability','region_Bronx','region_Brooklyn','region_Manhattan','region_Queens','region_StatenIsland','accommodation_type_Entire home/apt','accommodation_type_Private room','accommodation_type_Shared room']


In [24]:
traindata_imputed.head()

Unnamed: 0,id,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability,region_Bronx,region_Brooklyn,region_Manhattan,region_Queens,region_StatenIsland,accommodation_type_Entire home/apt,accommodation_type_Private room,accommodation_type_Shared room
0,13232.0,40.71854,-74.00439,170.0,5.0,7.0,0.56,929983.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,246.0,40.64446,-73.9503,65.0,3.0,238.0,2.3,281764.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,19091.0,40.78573,-73.81062,85.0,1.0,0.0,0.326637,19923341.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,34305.0,40.73863,-73.98002,210.0,30.0,0.0,0.872628,200380610.0,65.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,444.0,40.82426,-73.9463,75.0,3.0,38.0,0.42,745069.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [25]:
# Analyzing the missing values in the columns of dataframe via heatmap
print(traindata_imputed.isnull().sum())

id                                    0
latitude                              0
longitude                             0
cost                                  0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                     0
owner_id                              0
owned_hotels                          0
yearly_availability                   0
region_Bronx                          0
region_Brooklyn                       0
region_Manhattan                      0
region_Queens                         0
region_StatenIsland                   0
accommodation_type_Entire home/apt    0
accommodation_type_Private room       0
accommodation_type_Shared room        0
dtype: int64


In [26]:
# # Visualization, Modeling, Machine Learning

# #Build a model that categorizes hotels on the basis of their yearly availability. Identify how different features influence the decision. Please explain the findings effectively to technical and non-technical audiences using comments and visualizations, if appropriate


# *  **Build and optimized model that effectively solves the business problem** 
# *  **The model will be evaluated on the basis of Accuracy**
# *  **Read the test.csv file and prepare features for testing**


In [27]:
# PCA
# To convert high dimensional data to low dimensional data by keeping only those features 
# that capture maximum information about the dataset.

In [28]:
#splicing the dataset into input dataframe and target dataframe
xtrain= traindata_imputed.loc[:, traindata_imputed.columns != 'yearly_availability']

ytrain = traindata_imputed.loc[:, traindata_imputed.columns == 'yearly_availability']

In [29]:
# carrying our feature importance
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# bestfeatures = SelectKBest(score_func=chi2, k=10)
# fit = bestfeatures.fit(xtrain, ytrain)
# dfscores= pd.DataFrame(fit.scores)
# dfcolumns = pd.DataFrame(xtrain.columns)
# featureScores = pd.concat([dfcolumns,dfscores],axis=1)
# featureScores.columns = ['Specs','Score']  #naming the dataframe columns
# print(featureScores.nlargest(17,'Score'))  #print 17 best features
import matplotlib
matplotlib.use('TkAgg')
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(xtrain,ytrain.values.ravel())
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
xtrain_pd = pd.DataFrame(xtrain)
xtrain_pd.columns =['id','latitude','longitude','cost','minimum_nights','number_of_reviews','reviews_per_month','owner_id','owned_hotels','region_Bronx','region_Brooklyn','region_Manhattan','region_Queens','region_StatenIsland','accommodation_type_Entire home/apt','accommodation_type_Private room','accommodation_type_Shared room']
print(xtrain_pd)
feat_importances = pd.Series(model.feature_importances_, index=xtrain_pd.columns)
feat_importances.plot(kind='barh')
plt.xlabel('Score')
plt.ylabel('Features')
plt.show()
plt.savefig('feature_importance.png')

[0.06059058 0.03627463 0.03688646 0.03483069 0.03721809 0.05692662
 0.04675312 0.04169379 0.09428177 0.00531118 0.00503382 0.00627085
 0.00587184 0.00031904 0.17990841 0.3022982  0.04953091]
           id  latitude  longitude    cost  minimum_nights  number_of_reviews  \
0     13232.0  40.71854  -74.00439   170.0             5.0                7.0   
1       246.0  40.64446  -73.95030    65.0             3.0              238.0   
2     19091.0  40.78573  -73.81062    85.0             1.0                0.0   
3     34305.0  40.73863  -73.98002   210.0            30.0                0.0   
4       444.0  40.82426  -73.94630    75.0             3.0               38.0   
...       ...       ...        ...     ...             ...                ...   
2865  47852.0  40.74316  -73.98038   400.0             2.0                0.0   
2866  21978.0  40.73523  -73.99465   180.0             3.0                2.0   
2867  23485.0  40.76619  -73.98987   179.0             3.0               17.0   

In [30]:
xtrain.head()

Unnamed: 0,id,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,region_Bronx,region_Brooklyn,region_Manhattan,region_Queens,region_StatenIsland,accommodation_type_Entire home/apt,accommodation_type_Private room,accommodation_type_Shared room
0,13232.0,40.71854,-74.00439,170.0,5.0,7.0,0.56,929983.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,246.0,40.64446,-73.9503,65.0,3.0,238.0,2.3,281764.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,19091.0,40.78573,-73.81062,85.0,1.0,0.0,0.326637,19923341.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,34305.0,40.73863,-73.98002,210.0,30.0,0.0,0.872628,200380610.0,65.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,444.0,40.82426,-73.9463,75.0,3.0,38.0,0.42,745069.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [31]:
xtrain.shape

(2870, 17)

In [32]:
ytrain.shape

(2870, 1)

In [33]:
# prior to applying Principal component analysis we scale our dataset using a standard scaler
from sklearn.preprocessing import StandardScaler

In [34]:
scaler = StandardScaler()
xtrain= scaler.fit_transform(xtrain)
xtrain

array([[-0.95686879, -0.23089729, -1.09040816, ...,  1.36538326,
        -0.86294893, -0.53733531],
       [-1.87535597, -1.57946759, -0.00286314, ...,  1.36538326,
        -0.86294893, -0.53733531],
       [-0.54246741,  0.99224578,  2.80557258, ..., -0.73239509,
         1.15881713, -0.53733531],
       ...,
       [-0.23168405,  0.63653488, -0.79846596, ...,  1.36538326,
        -0.86294893, -0.53733531],
       [-0.70754896,  0.27572679, -0.44057538, ..., -0.73239509,
         1.15881713, -0.53733531],
       [-1.72618845,  1.10784272, -0.29480534, ..., -0.73239509,
        -0.86294893,  1.86103535]])

In [35]:
# we will use PCA  to select a subset of variables from a larger set, based on which 
# original variables have the highest correlation with the target
from sklearn.decomposition import PCA

In [36]:
pca = PCA(n_components = 17)
pca.fit(xtrain)

PCA(n_components=17)

In [37]:
var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)

In [38]:
var

array([ 15.1,  27.8,  40.2,  50.3,  58.3,  65. ,  71.5,  77.5,  83.2,
        88.4,  93.2,  96.1,  97.7,  98.9, 100. , 100. , 100. ])

In [39]:
plt.ylabel('% Variance Explained')
plt.xlabel('No. of features')
plt.title(' PCA Analysis')
plt.style.context('seaborn-whitegrid')
plt.plot(var)

[<matplotlib.lines.Line2D at 0x7f962519ac70>]

In [40]:
xtrain.shape

(2870, 17)

In [41]:
# preparing the test dataset

In [42]:
testdata = pd.read_csv('test.csv')

In [43]:
testdata.head()

Unnamed: 0,id,region,latitude,longitude,accommodation_type,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels
0,19215,Brooklyn,40.70912,-73.94513,Shared room,135,2,22,0.66,4360212,1
1,36301,Brooklyn,40.57646,-73.96641,Entire home/apt,69,2,8,0.9,181356989,2
2,40566,Manhattan,40.76616,-73.98228,Private room,225,30,0,,13773574,12
3,33694,Manhattan,40.77668,-73.94587,Shared room,125,30,9,0.82,6788748,1
4,28873,Manhattan,40.80279,-73.9445,Entire home/apt,43,1,13,0.72,105061915,2


In [44]:
testdata = pd.DataFrame(testdata)

In [45]:
testdata = pd.get_dummies(testdata, columns=['region','accommodation_type'])

In [46]:
testdata.head()

Unnamed: 0,id,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,region_Bronx,region_Brooklyn,region_Manhattan,region_Queens,region_Staten Island,accommodation_type_Entire home/apt,accommodation_type_Private room,accommodation_type_Shared room
0,19215,40.70912,-73.94513,135,2,22,0.66,4360212,1,0,1,0,0,0,0,0,1
1,36301,40.57646,-73.96641,69,2,8,0.9,181356989,2,0,1,0,0,0,1,0,0
2,40566,40.76616,-73.98228,225,30,0,,13773574,12,0,0,1,0,0,0,1,0
3,33694,40.77668,-73.94587,125,30,9,0.82,6788748,1,0,0,1,0,0,0,0,1
4,28873,40.80279,-73.9445,43,1,13,0.72,105061915,2,0,0,1,0,0,1,0,0


In [47]:
testdata=imp.fit_transform(testdata)

[IterativeImputer] Completing matrix with shape (718, 17)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.03
[IterativeImputer] Change: 2.3050971097385933, scaled tolerance: 273546.395 
[IterativeImputer] Early stopping criterion reached.


In [48]:
print(testdata)

[[ 1.921500e+04  4.070912e+01 -7.394513e+01 ...  0.000000e+00
   0.000000e+00  1.000000e+00]
 [ 3.630100e+04  4.057646e+01 -7.396641e+01 ...  1.000000e+00
   0.000000e+00  0.000000e+00]
 [ 4.056600e+04  4.076616e+01 -7.398228e+01 ...  0.000000e+00
   1.000000e+00  0.000000e+00]
 ...
 [ 3.138300e+04  4.071390e+01 -7.399120e+01 ...  1.000000e+00
   0.000000e+00  0.000000e+00]
 [ 4.713500e+04  4.074399e+01 -7.397204e+01 ...  0.000000e+00
   1.000000e+00  0.000000e+00]
 [ 1.315400e+04  4.070749e+01 -7.401413e+01 ...  0.000000e+00
   1.000000e+00  0.000000e+00]]


In [49]:
testdata= pd.DataFrame(testdata)

In [50]:
testdata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,19215.0,40.70912,-73.94513,135.0,2.0,22.0,0.66,4360212.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,36301.0,40.57646,-73.96641,69.0,2.0,8.0,0.9,181356989.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,40566.0,40.76616,-73.98228,225.0,30.0,0.0,1.157783,13773574.0,12.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33694.0,40.77668,-73.94587,125.0,30.0,9.0,0.82,6788748.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,28873.0,40.80279,-73.9445,43.0,1.0,13.0,0.72,105061915.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [51]:
testdata.columns =['id','latitude','longitude','cost','minimum_nights','number_of_reviews','reviews_per_month','owner_id','owned_hotels','region_Bronx','region_Brooklyn','region_Manhattan','region_Queens','region_StatenIsland','accommodation_type_Entire home/apt','accommodation_type_Private room','accommodation_type_Shared room']
                   

In [52]:
print(testdata)

          id  latitude  longitude   cost  minimum_nights  number_of_reviews  \
0    19215.0  40.70912  -73.94513  135.0             2.0               22.0   
1    36301.0  40.57646  -73.96641   69.0             2.0                8.0   
2    40566.0  40.76616  -73.98228  225.0            30.0                0.0   
3    33694.0  40.77668  -73.94587  125.0            30.0                9.0   
4    28873.0  40.80279  -73.94450   43.0             1.0               13.0   
..       ...       ...        ...    ...             ...                ...   
713  26801.0  40.84590  -73.91558   77.0             2.0               36.0   
714  20110.0  40.71026  -73.94744  145.0             3.0               43.0   
715  31383.0  40.71390  -73.99120   90.0             2.0               60.0   
716  47135.0  40.74399  -73.97204  200.0            30.0                0.0   
717  13154.0  40.70749  -74.01413  169.0            30.0                2.0   

     reviews_per_month     owner_id  owned_hotels  

In [53]:
testdata.head()

Unnamed: 0,id,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,region_Bronx,region_Brooklyn,region_Manhattan,region_Queens,region_StatenIsland,accommodation_type_Entire home/apt,accommodation_type_Private room,accommodation_type_Shared room
0,19215.0,40.70912,-73.94513,135.0,2.0,22.0,0.66,4360212.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,36301.0,40.57646,-73.96641,69.0,2.0,8.0,0.9,181356989.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,40566.0,40.76616,-73.98228,225.0,30.0,0.0,1.157783,13773574.0,12.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33694.0,40.77668,-73.94587,125.0,30.0,9.0,0.82,6788748.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,28873.0,40.80279,-73.9445,43.0,1.0,13.0,0.72,105061915.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [54]:
testdata.shape

(718, 17)

In [55]:
xtest = testdata.loc[:,testdata.columns != 'yearly_availability']
ytest = testdata.loc[:,testdata.columns == 'yearly_availability']

In [56]:
xtest.shape

(718, 17)

In [57]:
xtest=scaler.fit_transform(xtest)
xtest

array([[-0.61386364, -0.35992271,  0.16580865, ..., -0.72416582,
        -0.81602284,  1.7035768 ],
       [ 0.59247285, -2.7409478 , -0.2830036 , ...,  1.3808992 ,
        -0.81602284, -0.58700025],
       [ 0.89359805,  0.66384988, -0.61771462, ..., -0.72416582,
         1.2254559 , -0.58700025],
       ...,
       [ 0.24524337, -0.2741297 , -0.80584456, ...,  1.3808992 ,
        -0.81602284, -0.58700025],
       [ 1.35739439,  0.26593547, -0.40174481, ..., -0.72416582,
         1.2254559 , -0.58700025],
       [-1.04179325, -0.38917848, -1.28945664, ..., -0.72416582,
         1.2254559 , -0.58700025]])

In [58]:
pca.fit(xtest)

PCA(n_components=17)

In [59]:
# pip install imblearn

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pca = PCA(n_components=17)
logReg = LogisticRegression() 

pipe = Pipeline([('pca', pca), ('logistic', logReg)])
logReg.fit(xtrain, ytrain)
y_pred = logReg.predict(xtest)



  y = column_or_1d(y, warn=True)


In [61]:
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [None]:
regressor = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3
)

In [None]:
pca = PCA(n_components=17)


Task: Submit the predictions on the test dataset using your optimized model For each record in the test set(test.csv) predict the value of the yearly_availability variable. Submit a CSV file with a header row and one row per test entry.

The file (submissions.csv) should have exactly 2 columns:

id
yearly_availability

In [None]:
submission_df = pd.DataFrame(data={"id":testdata["id"],"yearly_availability":y_pred})

In [None]:
submission_df.to_csv('submisssions.csv', index=False)

In [None]:
submission_data = pd.read_csv('submisssions.csv')

In [None]:
submission_data.head()