# Data Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd

# Create a sample dataframe with a single categorical variable
df = pd.DataFrame({'Color': ['Red', 'Green', 'Blue', 'Green', 'Red']})
print("Original dataframe:")
print(df)

# One-hot encoding using sklearn OneHotEncoder: one indicator column per category.
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(df[['Color']])
# get_feature_names_out() returns the generated column names; the previous
# spelling "get_feature_names_output" is not an OneHotEncoder method and
# raised AttributeError.
one_hot_df = pd.DataFrame(
    one_hot_encoded.toarray(),
    columns=one_hot_encoder.get_feature_names_out(['Color']),
)
print("\nOne-hot encoded dataframe:")
print(one_hot_df)

# Label encoding using sklearn LabelEncoder: each category becomes an integer code,
# stored next to the original column.
label_encoder = LabelEncoder()
df['Color_LabelEncoded'] = label_encoder.fit_transform(df['Color'])
print("\nLabel encoded dataframe:")
print(df)


# Min-Max Scaling

https://towardsdatascience.com/data-normalization-with-pandas-and-scikit-learn-7c1cc6ed6475

In [1]:
from sklearn.preprocessing import MinMaxScaler
# Four samples with two features each
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
# Scaler with default settings; fit_transform learns the per-column range and
# rescales the data (the printed result below maps each column onto 0..1)
scaler = MinMaxScaler()
print(scaler.fit_transform(data))

[[0.   0.  ]
 [0.25 0.25]
 [0.5  0.5 ]
 [1.   1.  ]]


# Z-score Scaling (Standard Scaling)

In [None]:
from sklearn.preprocessing import StandardScaler
# create a scaler object
std_scaler = StandardScaler()
# NOTE(review): a bare expression in the middle of a cell is not displayed;
# only the last expression of a cell is rendered.
std_scaler
# fit and transform the data, then wrap the result in a DataFrame that keeps
# the original column names
# NOTE(review): df_cars is not defined anywhere in the visible notebook - it
# must be loaded before this cell can run.
df_std = pd.DataFrame(std_scaler.fit_transform(df_cars), columns=df_cars.columns)

df_std

# For Libraries

In [None]:
import pyreadstat #to read and write sas (sas7bdat, sas7bcat, xport), spps (sav, zsav, por)
#and stata (dta) data files into/from pandas dataframes
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Setting environment to ignore future warnings
import warnings
warnings.simplefilter('ignore')

# Changing default pandas setting to custom
from matplotlib.pylab import rcParams
rcParams["figure.figsize"] = (14, 7)
pd.set_option('max_columns', 30)
pd.set_option('max_rows', 20)
%matplotlib inline

# For Analysis

In [None]:
# Function to perform all EDA
def perform_eda(df, name=""):
    """Print and display a quick exploratory summary of a DataFrame.

    Shows, in order: size and shape, the first five rows, the numerical and
    categorical (object-dtype) column names, ``df.info()``, ``df.describe``
    over all columns, and a colour-graded Kendall correlation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; it is only read, never modified.
    name : str, optional
        Label used in the heading line.

    Notes
    -----
    Uses ``display``, which is available inside a notebook kernel.
    """
    separator = "=" * 50

    # Printing basic detail of data like name, size, shape
    print(f"EDA of {str(name)} Data....")
    print(f"Size {df.size}")
    print(f"Columns {df.shape[1]}")
    print(f"Records {df.shape[0]}")
    print(separator)

    # Printing top 5 records of data
    print("First Look of Data....")
    display(df.head())
    print(separator)

    # Getting Numerical and Categorical columns Separately.
    # "object" replaces np.object, an alias that was removed from NumPy.
    cat_cols = df.select_dtypes("object").columns
    num_cols = df.select_dtypes(np.number).columns

    # Printing the Numerical columns
    print("Dataset has following Numerical columns...")
    for position, column in enumerate(num_cols, start=1):
        print(f" {position}) {column}")

    # Printing the Categorical columns
    print("\n\nDataset has following Categorical columns...")
    for position, column in enumerate(cat_cols, start=1):
        print(f" {position}) {column}")

    # Printing info of data like data type, non null values.
    # df.info() writes its report itself and returns None, so it is not
    # wrapped in print() (that printed a stray "None").
    print(separator)
    print("Information of Data....")
    df.info()
    print(separator)

    # Displaying statistical properties of data like mean, median, max, min
    print("Statistical Properties of Data....")
    display(df.describe(include="all"))
    print(separator)

    # Displaying correlation of numerical features. The frame is restricted to
    # numeric/bool columns first because DataFrame.corr no longer drops text
    # columns silently in pandas 2.x.
    numeric_part = df.select_dtypes(include=["number", "bool"])
    corr = numeric_part.corr(method="kendall").style.background_gradient("YlOrRd_r")
    print("Correlation of Numerical features....")
    display(corr)

In [None]:
# Printing the name of columns, numbered from 1
# NOTE(review): `df` is whatever frame an earlier cell last bound to that name.
print("Dataset has following columns...")
for i, j in enumerate(df.columns):
    print(f" {i+1}) {j}")

In [None]:
# Getting Numerical and Categorical columns Separately.
# "object" replaces np.object, an alias that was removed from NumPy
# (np.object raises AttributeError on current releases).
cat_cols = df.select_dtypes("object").columns
num_cols = df.select_dtypes(np.number).columns

# Printing the Numerical columns
print("Dataset has following Numerical columns...")
for i, j in enumerate(num_cols):
    print(f" {i+1}) {j}")

# Printing the Categorical columns
print("\n\nDataset has following Categorical columns...")
for i, j in enumerate(cat_cols):
    print(f" {i+1}) {j}")

In [None]:
# Percentage on bar
def per_on_bar(feature, title="", limited=False, n=10):
    """Bar chart of a feature's value counts with a percentage label on each bar.

    Parameters
    ----------
    feature : pandas.Series
        Column whose categories are counted.
    title : str, optional
        Figure title.
    limited : bool, optional
        When True only the first ``n`` rows of the value counts are plotted.
    n : int, optional
        Number of categories kept when ``limited`` is True.

    Percentages are relative to ``len(feature)`` (all rows of the input), not
    to the number of plotted rows.
    """
    # Count once and reuse for both the summary line and the plot
    counts = feature.value_counts()
    print("Total unique values are: ", len(counts), "\n\n")
    print("Category\tValue\n")
    data = counts.head(n) if limited else counts
    print(data)

    # plotting bar-plot
    sns.set_style('darkgrid')
    plt.figure(figsize=(16,5))
    plt.title(title, fontsize=16)
    plt.xticks(rotation=45)
    plot = sns.barplot(x=data.index, y=data.values, edgecolor="black", palette=sns.palettes.color_palette("icefire"))
    total = len(feature)
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        # label sits on top of the bar, nudged left of the bar centre
        x = p.get_x() + p.get_width() / 2 - 0.08
        y = p.get_y() + p.get_height()
        plot.annotate(percentage, (x, y), size = 12)
    plt.show()

In [None]:
def _positive_ratio(frame):
    """Share of rows in `frame` whose Sepssis value is "Positive" (NaN if `frame` is empty)."""
    if len(frame) == 0:
        return float("nan")
    return len(frame[frame.Sepssis == "Positive"]) / len(frame)


def _plot_feature_panels(frame, feature_name, title):
    """Draw KDE, box and index-scatter panels of `feature_name` for `frame` on one figure."""
    fig, axes = plt.subplots(1, 3, figsize=(17, 4))
    sns.kdeplot(x=frame[feature_name], ax=axes[0])
    sns.boxplot(x=frame[feature_name], ax=axes[1])
    # x/y passed by keyword: seaborn >= 0.12 rejects them as positional arguments
    sns.scatterplot(x=frame.index, y=frame[feature_name], hue=frame.Sepssis, ax=axes[2])
    fig.suptitle(title)
    plt.show()


# Function to explore continous features
def explore_feature(feature_name):
    """Print summary statistics and draw plots for one numeric column of ``df_train``.

    Reads the notebook-global ``df_train``, which must contain ``feature_name``
    and a ``Sepssis`` column holding "Negative"/"Positive". Prints descriptive
    statistics, the correlation with the 0/1-encoded target, and the share of
    positive patients above and below the feature mean; then draws one row of
    plots for the whole data and one for the positive patients only.

    Parameters
    ----------
    feature_name : str
        Name of the column to explore.
    """
    series = df_train[feature_name]
    mean_value = series.mean()

    # Printing details
    print(f"Exploring {str(feature_name).upper()}........")
    print(f"Mean of {feature_name}     : {mean_value}")
    print(f"Median of {feature_name}   : {series.median()}")
    print(f"Mode of {feature_name}     : {series.mode()}")
    print(f"Variance of {feature_name} : {series.var()}")
    print(f"Skewness of {feature_name} : {series.skew()}")
    print(f"Maximum of {feature_name}  : {series.max()}")
    print(f"Minimum of {feature_name}  : {series.min()}")

    # .copy() so the target encoding below writes to an independent frame
    # instead of a slice of df_train (avoids SettingWithCopyWarning).
    temp = df_train[[feature_name, "Sepssis"]].copy()
    temp["Sepssis"] = temp["Sepssis"].map({"Negative": 0, "Positive": 1})
    corr = temp.corr().iloc[0, 1]
    print(f"Correlation with the target feature : {corr}")

    ratio = _positive_ratio(df_train[series > mean_value])
    print(f"Ratio of being Sepssis for the patient whose {feature_name} is more than average value : {ratio}")
    ratio = _positive_ratio(df_train[series < mean_value])
    print(f"Ratio of being Sepssis for the patient whose {feature_name} is less than average value : {ratio}")

    # Drawing plots (one figure each; the earlier code opened a second, empty
    # figure before every row of plots)
    _plot_feature_panels(df_train, feature_name, "Plots for Whole data")
    _plot_feature_panels(df_train[df_train.Sepssis == "Positive"], feature_name, "Plots for Sepssis Patients")

In [None]:
def make_plots(feature, title="", limited=False, n=10):
    """Show a percentage-annotated bar chart and a pie chart of a feature's value counts.

    ``feature`` is counted with ``value_counts``; when ``limited`` is True only
    the first ``n`` counts are plotted. Bar labels are percentages of
    ``len(feature)``.

    NOTE(review): ``make_plots`` is defined two more times further down in this
    notebook; whichever definition ran last is the one in effect.
    """
    print("Total unique values are: ", len(feature.value_counts()), "\n\n")
    print("Category\tValue\n")
    data = feature.value_counts()[0:n] if limited else feature.value_counts()
    print(data)
    categories_num = len(data)

    # Left panel: bar plot with the share of each category written on its bar
    sns.set_style('darkgrid')
    plt.figure(figsize=(16,5))
    plt.subplot(1,2,1)
    plt.title(title, fontsize=16)
    plt.xticks(rotation=45)
    bar_axes = sns.barplot(
        x=data.index,
        y=data.values,
        edgecolor="white",
        palette=sns.palettes.color_palette("icefire"),
    )
    total = len(feature)
    for patch in bar_axes.patches:
        label = f"{100 * patch.get_height() / total:.1f}%"
        anchor = (
            patch.get_x() + patch.get_width() / 2 - 0.08,
            patch.get_y() + patch.get_height(),
        )
        bar_axes.annotate(label, anchor, size = 12)

    # Right panel: pie chart of the same counts, every slice slightly exploded
    plt.subplot(1,2,2)
    plt.pie(
        x=data,
        autopct="%.1f%%",
        explode=[0.02] * categories_num,
        labels=data.index,
        pctdistance=0.5,
    )
    plt.title(title, fontsize=16)
    plt.show()

In [None]:
def make_bar_plot(x, y, title, xlable=None, ylable=None):
    """Draw a single seaborn bar plot of ``y`` against ``x`` and show it.

    Parameters
    ----------
    x, y : array-like
        Category positions and bar heights, forwarded to ``sns.barplot``.
    title : str
        Figure title.
    xlable, ylable : str, optional
        Axis labels.
    """
    plt.figure(figsize=(15, 5))
    # x/y passed by keyword: seaborn >= 0.12 rejects them as positional arguments
    sns.barplot(x=x, y=y)
    plt.title(title, fontsize=18)
    plt.xlabel(xlable, fontsize=14)
    plt.ylabel(ylable, fontsize=14)
    plt.xticks(rotation=65)
    plt.show()

In [None]:
def make_plots(x, y, title="", xlable="", ylable="", palette="Blues_d"):
    """Show a bar plot and a pie chart of ``y`` per category ``x`` side by side.

    Parameters
    ----------
    x : sequence
        Category labels (bar positions and pie labels).
    y : sequence
        Value for each category.
    title, xlable, ylable : str, optional
        Titles and axis labels.
    palette : str, optional
        Seaborn palette used for the bars.

    NOTE(review): ``make_plots`` is defined more than once in this notebook;
    whichever definition ran last is the one in effect.
    """
    #plotting bar-plot and pie chart
    sns.set_style('darkgrid')
    plt.figure(figsize=(16,5))
    plt.subplot(1,2,1)
    # x/y passed by keyword: seaborn >= 0.12 rejects them as positional arguments
    sns.barplot(x=x, y=y, palette=palette)
    plt.title(title, fontsize=14)
    plt.xlabel(xlable, fontsize=12)
    plt.ylabel(ylable, fontsize=12)
    plt.xticks(rotation=65)

    plt.subplot(1,2,2)
    categories_num = len(x)
    plt.pie(x=y, autopct="%.1f%%", explode=[0.08]*categories_num, labels=x, pctdistance=0.5)
    plt.title(title, fontsize=14)
    plt.show()

In [None]:
def plot_wordcloud(text):
    """Render a word cloud of every entry in ``text``.

    Parameters
    ----------
    text : iterable
        Values to visualise; each one is converted with ``str`` and split on
        whitespace, and the tokens are lowercased.

    English NLTK stopwords are excluded from the cloud (the NLTK "stopwords"
    corpus must already be downloaded).
    """
    # Imported here so the function does not depend on a later cell having
    # imported nltk first.
    import nltk
    from wordcloud import WordCloud

    stopwords = nltk.corpus.stopwords.words("english")

    # One pass over the entries and a single join, instead of growing a string
    # with += inside the loop.
    comment_words = "".join(
        " ".join(token.lower() for token in str(val).split()) + " "
        for val in text
    )

    wordcloud = WordCloud(width = 1200, height = 700, background_color ='black',
          stopwords = stopwords,
          min_font_size = 10).generate(comment_words)

    # plot the WordCloud image
    plt.figure(figsize = (12, 7), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

In [None]:
# Function to make plots for numerical feature
def make_plots(feature):
    """Show KDE, boxen and target-scatter panels for one numerical feature.

    Parameters
    ----------
    feature : pandas.Series
        Numerical column to plot.

    The scatter panel plots ``feature`` against the notebook-global ``df.High``.

    NOTE(review): ``make_plots`` is defined more than once in this notebook;
    whichever definition ran last is the one in effect.
    """
    # Setting figure size
    plt.figure(figsize=(13, 5))

    # Making KDE plot
    plt.subplot(131)
    sns.kdeplot(feature)

    # Making boxen plot
    plt.subplot(132)
    sns.boxenplot(feature)

    # Scatter plot to check relation with target.
    # Third slot of the 1x3 grid (the earlier 121 overlapped the first two
    # panels); x/y by keyword because seaborn >= 0.12 rejects positional x, y.
    plt.subplot(133)
    sns.scatterplot(x=feature, y=df.High)
    plt.show()

In [None]:
# Balancing data
from imblearn.over_sampling import SMOTE
# Oversampler with default settings
# NOTE(review): no random_state is passed - presumably the resampled data
# differs between runs; confirm and seed if reproducibility matters.
sampler = SMOTE()
# NOTE(review): X and y are overwritten with the resampled data, so re-running
# this cell resamples data that was already resampled.
X, y = sampler.fit_resample(X, y)

In [None]:
# A custom function defined in order to fine-tune the cleaning of the input text. This function is highly dependent on each usecase.
# Note: Only include misspelling or abbreviations of commonly used words.
#       Including many minimally present cases would negatively impact the performance.
_ABBREVIATION_MAP = {
    'u': 'you',
    'r': 'are',
    'some1': 'someone',
    'yrs': 'years',
    'hrs': 'hours',
    'mins': 'minutes',
    'secs': 'seconds',
    'pls': 'please',
    'plz': 'please',
    '2morow': 'tomorrow',
    '2day': 'today',
    '4got': 'forget',
    '4gotten': 'forget',
    # leftovers that are dropped entirely
    'amp': '',
    'quot': '',
    'lt': '',
    'gt': '',
    '½25': '',
}


def clean_abbreviation(token):
    """Return the expansion of a known abbreviation, or ``token`` unchanged.

    ``token`` is a single word (string). Matching is exact and case-sensitive;
    a handful of junk tokens map to the empty string.
    """
    return _ABBREVIATION_MAP.get(token, token)

import re
import nltk
# Fetch the NLTK resources requested below ("wordnet" and "stopwords")
nltk.download("wordnet")
nltk.download("stopwords")

# English stopword list and the lemmatizer instance used by clean_data
STOPWORDS = nltk.corpus.stopwords.words("english") 
lemmatizer = nltk.stem.WordNetLemmatizer()

# Function to clean the data
def clean_data(txt):
    """Normalise one raw text string for modelling.

    Steps: strip the "RT" retweet marker, lowercase, drop '#' characters,
    @mentions and http(s) links, turn newlines and every non-alphanumeric
    character into spaces, expand abbreviations with ``clean_abbreviation``,
    then remove ``STOPWORDS`` and lemmatize what is left.

    Parameters
    ----------
    txt : str
        Raw text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces.
    """
    # Strip the retweet marker while the text still has its original case: the
    # pattern is upper-case, so running it after lower() could never match.
    # \b keeps it from eating the tail of words such as "START ".
    txt = re.sub(r'\bRT[\s]+', '', txt)           # Removing Retweets
    txt = txt.lower()                             # lowering text
    txt = re.sub(r'#', '', txt)                   # Removing hashtags
    txt = re.sub(r'@[A-Za-z0-9]+', '', txt)       # Removing Mentions
    txt = re.sub(r'https?:\/\/\S+', '', txt)      # Removing Links
    txt = re.sub(r'\n', ' ', txt)                 # Removing Newline
    txt = re.sub(r"[^a-zA-Z0-9]"," ", txt)        # Removing all special characters
    txt = " ".join([clean_abbreviation(i) for i in txt.split()])                           # Checking for abbreviations
    txt = " ".join([lemmatizer.lemmatize(i) for i in txt.split() if i not in STOPWORDS])   # Removing stopwords and applying lemmatization
    return txt

# For Next Procedure

In [None]:
# Summarise, per column, the number of unique values, the count and percentage of missing values, and the dtype.
def statistics(df):
    """Return a missing-value summary for the columns of ``df`` that have gaps.

    The result is indexed by ``Feature`` and has the columns ``Unique_values``,
    ``Missing values``, ``Percentage of Missing Values`` and ``Data Type``.
    Columns without missing values are left out; rows are ordered by missing
    percentage, highest first.
    """
    n_rows = df.shape[0]
    records = []
    for column in df.columns:
        n_missing = df[column].isnull().sum()
        records.append(
            (column, df[column].nunique(), n_missing, n_missing * 100 / n_rows, df[column].dtype)
        )

    header = ['Feature', 'Unique_values', 'Missing values', 'Percentage of Missing Values', 'Data Type']
    summary = pd.DataFrame(records, columns=header).set_index('Feature', drop=True)
    # keep only the features that actually have missing values
    summary = summary[summary['Missing values'] != 0]
    return summary.sort_values('Percentage of Missing Values', ascending=False)

# Show the missing-value summary of the current `df`
statistics(df)

In [None]:
from scipy.stats import normaltest

# Function to make kde plot
def make_kde(feature):
    """Show a kernel-density plot of ``feature`` on a new 12x5 inch figure."""
    _, axes = plt.subplots(figsize=(12, 5))
    sns.kdeplot(feature, ax=axes)
    plt.show()

# Iterating Numerical columns
for i in num_cols:
    # Finding Normality of feature (missing values are excluded from the test)
    st, p = normaltest(train[i].dropna())

    # Assigning the filled column back replaces `fillna(..., inplace=True)` on
    # a column selection, which is chained assignment: pandas warns about it
    # and, with copy-on-write, leaves the frame unchanged.
    # NOTE(review): the test set is filled with its own mean/median; confirm
    # whether the statistic from `train` should be used instead.
    if p > 0.05:
        print("Normal")
        # Filling with mean if normal
        train[i] = train[i].fillna(train[i].mean())
        test[i] = test[i].fillna(test[i].mean())
    else:
        print("Not Normal")
        # Filling with median if not normal
        train[i] = train[i].fillna(train[i].median())
        test[i] = test[i].fillna(test[i].median())

    # making kde plot
    make_kde(train[i])

In [None]:
# Work on a copy so `df` keeps the original values
df_n = df.copy()

# lets try to remove the outliers (1.5 * IQR rule, column by column)
for x in df[cols].columns.tolist():
    q75,q25 = np.percentile(df.loc[:,x],[75,25])
    intr_qr = q75-q25

    # Named bounds instead of `max`/`min`: binding those names at cell level
    # shadows the Python builtins for every later cell in the kernel.
    upper_bound = q75+(1.5*intr_qr)
    lower_bound = q25-(1.5*intr_qr)

    # Values outside the fences become NaN in the copy
    df_n.loc[(df[x] < lower_bound) | (df[x] > upper_bound), x] = np.nan

# lets try to check the sum of count of NULL values/outliers in each column of the dataset
print(df_n.isnull().sum())
df1 = df_n.dropna(axis = 0)

In [None]:
# Drop every row of df_n that contains a NaN, then confirm none remain
df1 = df_n.dropna(axis = 0)
print(df1.isnull().sum())
print()
print("Shape :",df1.shape)

In [None]:
# Finding Correlation of features
plt.figure(figsize=(18, 12))
# Restrict to numeric/bool columns first: DataFrame.corr no longer drops text
# columns silently in pandas 2.x and raises on them instead.
sns.heatmap(
    df.select_dtypes(include=["number", "bool"]).corr().round(2),
    annot=True, vmin=-1, vmax=1, cbar=False,
)
plt.show()

In [None]:
# check VIF scores
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One output row per column of `house`
# NOTE(review): `house` is not defined in the visible notebook - presumably an
# all-numeric feature frame; confirm before running.
vif_data = pd.DataFrame()
vif_data["feature"] = house.columns

# VIF of column i, computed from the full value matrix and rounded to 2 decimals
vif_data["VIF"] = np.round([variance_inflation_factor(house.values, i) for i in range(len(house.columns))], 2)
vif_data

In [None]:
def correlation(dataset, threshold):
    """Drop one column of every pair whose correlation is at least ``threshold``.

    Walks the lower triangle of the correlation matrix; whenever a pair reaches
    the threshold and the earlier column has not itself been dropped, the later
    column is deleted.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to prune. It is modified IN PLACE; the same object is returned.
    threshold : float
        Minimum correlation that triggers a drop. The comparison is signed, so
        strongly negative correlations are not removed.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` itself, without the dropped columns.
    """
    col_corr = set() # Set of all the names of deleted columns
    # Correlate numeric/bool columns only: DataFrame.corr raises on text
    # columns in pandas 2.x. Non-numeric columns are left untouched.
    corr_matrix = dataset.select_dtypes(include=["number", "bool"]).corr()
    columns = corr_matrix.columns
    for i in range(len(columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (columns[j] not in col_corr):
                colname = columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

    return dataset

In [None]:
# defining Tuning class
class RandomSearchCV:
    """Minimal random search over a hyper-parameter grid.

    Builds the full Cartesian product of the given parameter values, draws
    ``n_iter`` candidates from it at random (with replacement), and keeps the
    candidate with the best accuracy on a held-out set.

    NOTE(review): despite the name there is no cross-validation; every
    candidate is scored once on the ``X_test``/``y_test`` passed to ``fit``.

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params``, ``fit`` and ``predict`` (scikit-learn
        estimator API). The same instance is reused for every candidate.
    parameter_dictionary : dict
        Maps parameter name to the list of values to try.
    n_iter : int
        Number of candidates to draw.

    Attributes
    ----------
    best_params_ : list of dict or None
        Best candidate, as a list of single-entry ``{name: value}`` dicts.
    best_score_ : float
        Accuracy of the best candidate (0 until ``fit`` finds a better one).
    """
    # Defining constructor
    def __init__(self, model, parameter_dictionary, n_iter):
        from itertools import product

        # Creating class members
        self.model = model
        self.params = parameter_dictionary
        self.n_iter = n_iter
        self.best_params_ = None
        self.best_score_ = 0
        self.params_space = []

        # Generating parameters space: every combination of the supplied
        # values, for any number of parameters (first parameter varies slowest).
        keys = list(self.params.keys())
        for combo in product(*self.params.values()):
            self.params_space.append([{key: value} for key, value in zip(keys, combo)])

        # Selecting n parameters combinations
        rand = np.random.randint(0, len(self.params_space), self.n_iter)
        self.select_params = [self.params_space[i] for i in rand]

    # Defining fit function
    def fit(self, X_train, y_train, X_test, y_test):
        """Train the model on every selected candidate and record the best one."""
        from tqdm import tqdm
        from sklearn.metrics import accuracy_score

        # Training model on all selected params
        for candidate in tqdm(self.select_params):
            model = self.model
            for single_param in candidate:
                # set_params applies the value to the parameter with that name
                # (the former `model.p = value` only created an attribute
                # literally called "p", so no candidate was ever applied).
                model.set_params(**single_param)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            if score > self.best_score_:
                self.best_score_ = score
                self.best_params_ = candidate

    # Function to take parameter space
    def get_param_space(self):
        """Return every parameter combination that was generated."""
        return self.params_space

    # Function to take selected parameter space
    def get_selected_param_space(self):
        """Return the randomly drawn combinations that ``fit`` evaluates."""
        return self.select_params


# Defining parameters values
params = {
    "max_depth": [4, 6, 3],
    "criterion": ["gini", "entropy"]
}

# Creating and fitting RandomSearch
# NOTE(review): DecisionTreeClassifier is imported in a later cell of this
# notebook; on a fresh kernel that import has to run first.
search = RandomSearchCV(DecisionTreeClassifier(), params, 2)
search.fit(X_train, y_train, X_test, y_test)

In [None]:
# Importing Models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Importing evaluation modules
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, max_error
from sklearn.model_selection import cross_val_score

# check the performance on different regressors
models = []
models.append(('Ridge', Ridge()))
models.append(('LinearRegression', LinearRegression()))
models.append(('KNeighborsRegressor', KNeighborsRegressor()))
models.append(('Random Forest', RandomForestRegressor()))
models.append(('Decision Tree', DecisionTreeRegressor()))

# per-model results, in the order of `models`
train_l = []
test_l = []
mae_l = []
rmse_l = []
r2_l = []

import time
i = 0
for name,model in models:
    i = i+1
    start_time = time.time()

    # Fitting model to the Training set
    clf = model
    clf.fit(X_train, y_train)

    # Scores of model. Named *_score so the loop does not rebind `train` and
    # `test`, which other cells use for DataFrames.
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)

    train_l.append(train_score)
    test_l.append(test_score)

    # predict values
    predictions = clf.predict(X_test)
    # RMSE
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    rmse_l.append(rmse)
    # MAE
    mae = mean_absolute_error(y_test,predictions)
    mae_l.append(mae)
    # R2 score
    r2 = r2_score(y_test,predictions)
    r2_l.append(r2)
    # Max error and cross-validation scores: both are printed below but were
    # never computed in this cell (NameError / stale values from another cell).
    # For regressors the default cross_val_score metric is the estimator's
    # own score (R2).
    m_errors = max_error(y_test, predictions)
    scores = cross_val_score(clf, X_train, y_train, cv=5)

    print("+","="*100,"+")
    print('\033[1m' + f"\t\t\t{i}-For {name} The Performance result is: " + '\033[0m')
    print("+","="*100,"+")
    print('Root mean squared error (RMSE) : ', rmse)
    print("-"*50)
    print('Mean absolute error (MAE) : ', mae)
    print("-"*50)
    print('Max errors : ', m_errors)
    print("-"*50)
    print('R2 score : ', r2)
    print("-"*50)
    print('cross validation accuracy : ', np.mean(scores))
    print("-"*50)

    print("\t\t\t\t\t\t\t-----------------------------------------------------------")
    print(f"\t\t\t\t\t\t\t Time for detection ({name}) : {round((time.time() - start_time), 3)} seconds...")
    print("\t\t\t\t\t\t\t-----------------------------------------------------------")
    print()

comp = pd.DataFrame({"Training Score": train_l, "Testing Score": test_l, "MAE": mae_l, "RMSE": rmse_l, "R2 Score": r2_l})
comp

In [2]:
# Importing Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
# Importing Evaluation metrics.
# ConfusionMatrixDisplay replaces plot_confusion_matrix, which has been removed
# from scikit-learn.
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
    classification_report, ConfusionMatrixDisplay,
)

# check the performance on different classifiers
models = []
models.append(('Support Vector Classifier', svm.SVC()))
models.append(('LogisitcRegression', LogisticRegression()))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))
models.append(('RandomForestClassifier', RandomForestClassifier()))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))


# prepare the cross-validation procedure
cv = KFold(n_splits=5, random_state=1, shuffle=True)

# metrics to store performance, one entry per model in the order of `models`
acc = []
pre = []
f1 = []
con = []
rec = []


import time
i = 0
for name,model in models:
    i = i+1
    start_time = time.time()

    # Fitting model to the Training set
    clf = model
    clf.fit(X_train, y_train)

    # predict values
    y_pred = clf.predict(X_test)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    acc.append(accuracy)
    # Precision (average=None keeps one value per class)
    precision = precision_score(y_test, y_pred, average=None)
    pre.append(precision)
    # Recall
    recall = recall_score(y_test, y_pred, average=None)
    rec.append(recall)
    # F1 Score
    f1_sco = f1_score(y_test, y_pred, average=None)
    f1.append(f1_sco)
    # Confusion Matrix
    confusion_mat = confusion_matrix(y_test, y_pred)
    con.append(confusion_mat)
    # Report
    report = classification_report(y_test, y_pred)

    # evaluate model with 5-fold cross-validation on the full X, y
    scores = cross_val_score(clf, X, y, cv=cv, n_jobs=-1)

    print("+","="*100,"+")
    print('\033[1m' + f"\t\t\t{i}-For {name} The Performance result is: " + '\033[0m')
    print("+","="*100,"+")
    print('Accuracy : ', accuracy)
    print("-"*50)
    print('F1 : ', f1_sco)
    print("-"*50)
    print('Reacll : ', recall)
    print("-"*50)
    print('Precision : ', precision)
    print("-"*50)
    print('cross validation accuracy : ', np.mean(scores))
    print("-"*50)
    print('Confusion Matrix....\n', confusion_mat)
    print("-"*50)
    print('Classification Report....\n', report)
    print("-"*50)
    print('Plotting Confusion Matrix...\n')
    ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
    plt.show()

    print("\t\t\t\t\t\t\t-----------------------------------------------------------")
    print(f"\t\t\t\t\t\t\t Time for detection ({name}) : {round((time.time() - start_time), 3)} seconds...")
    print("\t\t\t\t\t\t\t-----------------------------------------------------------")
    print()

pd.DataFrame({"Model": dict(models).keys(), "Accuracy": acc, "Precision": pre, "Recall": rec, "F1_Score": f1, "Confusion Matrix": con})

NameError: name 'X_train' is not defined

In [None]:
from sklearn.utils import class_weight
# `classes` and `y` must be passed by keyword: compute_class_weight no longer
# accepts them positionally in current scikit-learn.
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# NOTE(review): the balanced weights computed above are immediately replaced by
# this hand-picked mapping - keep whichever of the two is actually intended.
class_weights = {0: 1, 1: 10}

In [None]:
# Bar chart of the accuracies collected in `acc`, one bar per entry of `models`
# NOTE(review): relies on `models` and `acc` still holding the classifier run;
# the next cell rebinds `models`, so run this one first.
plt.figure(figsize=(14, 6))
sns.barplot(x=list(dict(models).keys()), y=acc)
plt.title("Model's Accuracies", fontsize=22)
plt.xlabel("Models", fontsize=17)
plt.ylabel("Accuracy", fontsize=17)
plt.show()

In [None]:
# LinearSVR is used below but was never imported in the visible notebook.
# The other regressors, KFold and cross_val_score come from earlier cells.
from sklearn.svm import LinearSVR

models = []
models.append(('LR', LinearRegression()))
models.append(('RFR', RandomForestRegressor()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVM', LinearSVR()))
# evaluate each model in turn with the same 10-fold split
# (the splitter does not depend on the model, so it is built once)
kfold = KFold(n_splits=10)
results = []
names = []
for name, model in models:
    # scores are negated MAE values, so values closer to 0 are better
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
fig = plt.figure(figsize=(14, 5))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
def ModelSelection(test_data,features,label,dummy_variables_list):
    """Fit a fixed list of classifiers on a 70/30 split and rank them by test score.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Source data; must contain every name in ``features``, the ``label``
        column and a ``'uri'`` column.
    features : list
        Feature column names used for training.
    label : str
        Target column name.
    dummy_variables_list : list
        Currently unused (the get_dummies calls are commented out).

    Returns
    -------
    tuple
        ``(MLA_compare, x_train, x_test, y_train, y_test, x_train_backup,
        x_test_backup)`` where ``MLA_compare`` lists name, parameters and test
        score per algorithm, best first, and the ``*_backup`` frames are the
        split including the ``'uri'`` column.

    NOTE(review): ``ensemble``, ``linear_model``, ``tree``, ``XGBClassifier``
    and ``train_test_split`` are not imported in the visible notebook.
    """
    MLA = [

    ensemble.BaggingClassifier(),
    ensemble.GradientBoostingClassifier(),
    RandomForestClassifier(),

    XGBClassifier(),

    linear_model.LogisticRegressionCV(),
    linear_model.SGDClassifier(),

    svm.SVC(probability=True),

    tree.DecisionTreeClassifier(),

    ]

    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Score']
    MLA_compare = pd.DataFrame(columns = MLA_columns)
    # 'uri' travels through the split so the *_backup frames keep it
    features_with_customer_id=features.copy()
    features_with_customer_id.append('uri')
    x_train,x_test,y_train,y_test = train_test_split (test_data[features_with_customer_id],test_data[label],test_size=0.3,random_state=0)
    print('features used: ',features)
    x_train_backup=x_train
    x_test_backup=x_test
    x_train=x_train[features]
    x_test=x_test[features]
    #x_train=pd.get_dummies(x_train, columns=dummy_variables_list)
    #x_test=pd.get_dummies(x_test, columns=dummy_variables_list)
    row_index = 0
    # Predictions per algorithm, kept in a plain dict. They used to be written
    # into `test_data[label]`, i.e. into a slice of the caller's data, with
    # arrays whose length does not match it.
    MLA_predict = {}
    for alg in MLA:

        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score']=alg.score(x_test,y_test)
        row_index+=1


    MLA_compare.sort_values(by = ['MLA Score'], ascending = False, inplace = True)
    return MLA_compare,x_train,x_test,y_train,y_test,x_train_backup,x_test_backup

In [None]:
#feature selection
def feature_selection(features,clf,threshold):
    """Rank features by a fitted model's importances and keep the top ones.

    Parameters
    ----------
    features : sequence of str
        Feature names, in the order the model was trained on.
    clf : fitted estimator
        Object exposing ``feature_importances_``.
    threshold : int
        Number of top-ranked features to keep (a count, not a score cut-off).

    Returns
    -------
    important_features : list
        Names of the ``threshold`` most important features, best first.
    feature_importance : pandas.DataFrame
        Those rows, with columns ``index`` (original position), ``feature``
        and ``importance_score``.
    feature_score : pandas.DataFrame
        Every feature with its score, in input order.
    """
    # Build the score table in one step instead of appending row by row;
    # zip() pairs names and scores and stops at the shorter of the two.
    feature_score = pd.DataFrame(
        list(zip(features, clf.feature_importances_)),
        columns=['feature','importance_score'],
    )

    feature_importance=feature_score.sort_values(by=['importance_score'],ascending=False).reset_index().head(threshold)
    important_features=feature_importance['feature'].to_list()
    return important_features,feature_importance,feature_score

# For K-Means

In [None]:
# Let's find best number of clusters using Elbow method
# KMeans is used below but was never imported in the visible notebook.
from sklearn.cluster import KMeans

wcss=[]
number_clusters = range(1,7)
for i in number_clusters:
    # fixed seed so the elbow curve is reproducible between runs
    kmeans = KMeans(i, random_state=42)
    kmeans.fit(df)
    # inertia_ is the within-cluster sum of squares for this k
    wcss_iter = kmeans.inertia_
    wcss.append(wcss_iter)

plt.plot(number_clusters,wcss)
plt.title('The Elbow Method', fontsize=18)
plt.xlabel('Number of clusters', fontsize=16)
plt.ylabel('WCSS', fontsize=16)
plt.show()

# In this graph we can see that the elbow is made at x value 2. That means the best number of clusters is 2 for our dataset.
# NOTE(review): the comment above says 2 clusters, but the final model below is
# built with n_clusters = 3 - confirm which value is intended.
# Let's Build Final Model
model = KMeans(n_clusters = 3, init = "k-means++", random_state = 42)
y_kmeans = model.fit_predict(df)

In [None]:
# save the model to disk
import pickle as pkl

filename = 'finalized_model.sav'
# The context manager closes the file even if pickling fails; the previous
# open(...) call left the handle open.
# NOTE(review): random_classifier is not defined in the visible notebook.
with open(filename, 'wb') as model_file:
    pkl.dump(random_classifier, model_file)

In [None]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since ``start_time``.

    Parameters
    ----------
    start_time : datetime.datetime, optional
        Value returned by an earlier ``timer()`` call.

    Returns
    -------
    datetime.datetime or None
        The current time when called without ``start_time``; otherwise None,
        after printing the elapsed hours, minutes and seconds.
    """
    # Local import: datetime is not imported anywhere in the visible notebook.
    from datetime import datetime

    if not start_time:
        return datetime.now()

    thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))