In [23]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,mean_absolute_error,mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from ucimlrepo import fetch_ucirepo 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_iris
from sklearn.svm import SVC,SVR
from sklearn.datasets import make_classification,make_regression
import joblib
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Import for classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# Import for regression
from sklearn.tree import DecisionTreeRegressor


In [24]:
def eda(df, target, type='a'):  # Exploratory Data Analysis


    # Display unique label count
    unique_count = df[target].nunique()
    print("Label Count =", unique_count)

    # Type 'a': Standard preprocessing
    if type == 'a':
        for col in df.select_dtypes(include=['object']).columns:
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])
        # Handle missing values
        print(df.isnull().sum())
        df.fillna(df.mean(), inplace=True)  
        x = df.drop(columns=[target])
        y = df[target]
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

        # Apply StandardScaler and MinMaxScaler
        sc = StandardScaler()
        x_train = sc.fit_transform(x_train)
        x_test = sc.transform(x_test)

        scaler = MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        print(df.shape)
        return x_train, x_test, y_train, y_test

    # Type 'b': TF-IDF for text data
    if type == 'b':
        df.dropna(inplace=True)
        x = df[target].astype(str)  # Use only the text column for vectorization
        y = df[target]

        # Ensure labels are categorical
        if not y.dtype == 'object':
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)

        x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

        vectorizer = TfidfVectorizer()
        x_train = vectorizer.fit_transform(x_train_raw)
        x_test = vectorizer.transform(x_test_raw)

        return x_train, x_test, y_train, y_test


In [25]:
base_path= '/home/mr-arthor/Desktop/CDAC/Practical_Machine_Learning/Lab_Work/Datasets'
Launch_Path= 'Models/'
os.makedirs(Launch_Path, exist_ok=True)


In [26]:

def SVR_Func(x_train, x_test, y_train, y_test, parameters, CV, Ver, name,bypass=True):
    model_path = Launch_Path + name + '.pkl'
    
    if os.path.exists(model_path) and bypass:
        print("Model already trained and saved at", model_path)
    else:
         
        grid_search = RandomizedSearchCV(SVR(),
                                            param_distributions=parameters,
                                            n_iter=50,
                                          
                                          n_jobs=-1, cv=CV, verbose=Ver)
        grid_search.fit(x_train, y_train)
        print(grid_search.best_params_)
        y_pred = grid_search.predict(x_test)
        print("Mean Square Error", mean_squared_error(y_test, y_pred))
        print("Mean Absolute Error", mean_absolute_error(y_test, y_pred))
        print("R2 Score", r2_score(y_test, y_pred))
        best_model = grid_search.best_estimator_
        joblib.dump(best_model, model_path)
        plt.figure(figsize=(10, 6))
        print("Best model saved to", model_path)


In [27]:
df_AirBnb = pd.read_csv(base_path+'/Airbnb_Open_Data.csv')


  df_AirBnb = pd.read_csv(base_path+'/Airbnb_Open_Data.csv')


In [28]:
df_AirBnb.drop(columns=['id','NAME','host id','host name','last review','reviews per month'],inplace=True)


In [29]:
for col in df_AirBnb.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    df_AirBnb[col] = label_encoder.fit_transform(df_AirBnb[col])


In [30]:
print(df_AirBnb.isnull().sum())
df_AirBnb.fillna(df_AirBnb.mean(), inplace=True)  


host_identity_verified              0
neighbourhood group                 0
neighbourhood                       0
lat                                 8
long                                8
country                             0
country code                        0
instant_bookable                    0
cancellation_policy                 0
room type                           0
Construction year                 214
price                               0
service fee                         0
minimum nights                    409
number of reviews                 183
review rate number                326
calculated host listings count    319
availability 365                  448
house_rules                         0
license                             0
dtype: int64


In [31]:
X = df_AirBnb.iloc[:, :-1].values  # Features
y = df_AirBnb['price'].values   


In [32]:
pca = PCA(n_components=4)  # Specify the number of principal components
principalComponents = pca.fit_transform(X)


In [33]:
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['Principal Component 1',
                                    'Principal Component 2',
                                    'Principal Component 3',
                                    'Principal Component 3'])


In [34]:
finalDf = pd.concat([principalDf, pd.DataFrame(y, columns=['price'])], axis=1)


In [35]:
finalDf.head()


Unnamed: 0,Principal Component 1,Principal Component 2,Principal Component 3,Principal Component 3.1,price
0,-1032.655144,540.680278,152.300851,4.495268,1114
1,-262.679852,-334.693617,88.306132,22.066438,243
2,-894.871785,169.589669,215.704071,-8.560195,734
3,497.941345,-118.223982,187.579667,-77.716649,469
4,-112.163496,-275.190575,147.365248,-42.676316,305


In [36]:
x_train, x_test, y_train, y_test = eda(finalDf,'price')


Label Count = 1152
Principal Component 1    0
Principal Component 2    0
Principal Component 3    0
Principal Component 3    0
price                    0
dtype: int64
(102599, 5)


In [37]:
parameters = [{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['rbf']}] #parameters for SVR


In [38]:
SVR_Func(x_train, x_test, y_train, y_test,parameters,6,3,'AirBnb',False) #SVR Function


Fitting 6 folds for each of 5 candidates, totalling 30 fits




KeyboardInterrupt: 