In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.model_selection import train_test_split
import sklearn.metrics as metric
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
import sklearn.preprocessing as pre


import warnings
warnings.filterwarnings("ignore")

import wrangle as w
import explore as e
import model as m

from importlib import reload

In [None]:
def get_explore_data():
    '''
    Load the exploration dataset from ``train_data.csv`` (path is relative
    to the current working directory) and return it as a DataFrame with
    the ``index`` column removed.
    '''
    raw = pd.read_csv('train_data.csv')
    return raw.drop(columns='index')

In [None]:
def prep_data(df, features=None):
    '''
    Parse ``startdate`` as datetimes and optionally narrow the frame to
    a chosen set of columns.

    Parameters
    ----------
    df : DataFrame containing a ``startdate`` column. The column is
        converted in place, so the caller's frame is modified.
    features : list or tuple of column labels to keep. ``None`` or an
        empty sequence keeps every column.

    Returns
    -------
    ``df`` itself when no features are requested, otherwise the
    selection ``df[features]``.
    '''
    df['startdate'] = pd.to_datetime(df['startdate'])

    # ``None`` replaces the former mutable ``[]`` default argument.
    if features is None or len(features) == 0:
        return df

    # A tuple would be read by pandas as a single (multi-level) key.
    if isinstance(features, tuple):
        features = list(features)
    return df[features]

def create_region_bins(df):
    '''
    Add a ``region_bins`` column that groups the fifteen Koppen-Geiger
    codes found in ``region`` by their first letter:
    B -> 'Dry', C -> 'Temperate', D -> 'Continental'.
    Values of ``region`` that are not listed are carried over unchanged.
    The frame is modified in place and also returned.
    '''
    codes_by_bin = {
        'Dry': ['BWh', 'BWk', 'BSh', 'BSk'],
        'Temperate': ['Csa', 'Csb', 'Cfa', 'Cfb'],
        'Continental': ['Dsb', 'Dsc', 'Dwa', 'Dwb', 'Dfa', 'Dfb', 'Dfc'],
    }
    # Flatten to the code -> bin lookup expected by ``replace``
    code_to_bin = {
        code: bin_label
        for bin_label, codes in codes_by_bin.items()
        for code in codes
    }
    df['region_bins'] = df['region'].replace(code_to_bin)

    return df


def create_elevation_bins(df):
    '''
    Add an ``elevation_range`` column that assigns every row to one of
    four quartile bins of ``elevation``, labelled from lowest to highest
    'bottom_low', 'top_low', 'mid' and 'high'.
    The frame is modified in place and also returned.
    '''
    quartile_labels = ['bottom_low', 'top_low', 'mid', 'high']
    df['elevation_range'] = pd.qcut(df['elevation'], q=4, labels=quartile_labels)
    return df

    
def create_season_bins(df):
    '''
    Derive ``month`` from ``startdate`` and a ``season`` label from the month.

    ``startdate`` must already be a datetime column. Months map to
    'Autumn' (9-11), 'Winter' (12, 1, 2), 'Spring' (3-5) and 'Summer' (6-8);
    a missing month is labelled 'Other'.
    The frame is modified in place and also returned.
    '''
    #create month column
    df['month'] = df['startdate'].dt.month

    #define season groups
    season_groups = {
        "Autumn": [9, 10, 11],
        "Winter": [12, 1, 2],
        "Spring": [3, 4, 5],
        "Summer": [6, 7, 8],
    }
    # Invert to month -> season so the lookup is a single vectorised map
    # rather than building a Python list for every row.
    month_to_season = {
        month: season
        for season, months in season_groups.items()
        for month in months
    }

    #add season onto the df; unmatched (e.g. NaT) months become "Other"
    df["season"] = df["month"].map(month_to_season).fillna("Other")
    return df

def rename_data(df):
    '''
    Return a copy of ``df`` in which the raw contest column names listed
    below are replaced by shorter, readable names. Columns that are not
    listed keep their names.
    '''
    readable_names = {
        'climateregions__climateregion': 'region',
        'elevation__elevation': 'elevation',
        'contest-pevpr-sfc-gauss-14d__pevpr': 'potential_evap',
        'contest-precip-14d__precip': 'precip',
        'contest-pres-sfc-gauss-14d__pres': 'barometric_pressure',
        'contest-prwtr-eatm-14d__prwtr': 'all_atmos_precip',
        'contest-rhum-sig995-14d__rhum': 'relative_humidity',
        'contest-slp-14d__slp': 'sea_level_press',
        'contest-tmp2m-14d__tmp2m': 'mean_temp',
        'contest-wind-h10-14d__wind-hgt-10': 'height_10_mb',
        'contest-wind-h100-14d__wind-hgt-100': 'height_100_mb',
        'contest-wind-h500-14d__wind-hgt-500': 'height_500_mb',
        'contest-wind-h850-14d__wind-hgt-850': 'height_850_mb',
        'contest-wind-uwnd-250-14d__wind-uwnd-250': 'zonal_wind_250mb',
        'contest-wind-uwnd-925-14d__wind-uwnd-925': 'zonal_wind_925mb',
        'contest-wind-vwnd-250-14d__wind-vwnd-250': 'long_wind_250mb',
        'contest-wind-vwnd-925-14d__wind-vwnd-925': 'long_wind_925mb',
    }
    return df.rename(columns=readable_names)

def deal_with_nulls(df):
    '''
    Recompute the columns of ``df`` that contain nulls from a group-mean
    column and a group of sibling columns, then drop the ``ccsm30`` column.

    Every column that has at least one null is paired, by position, with
    one of the seven column groups (``g_1`` .. ``g_7``) and one of the
    seven mean columns (``g_means``). The whole column is then overwritten
    with ``mean * 9 - sum(group)``.

    NOTE(review): the pairing is purely positional, so it is only correct
    if the null columns appear in ``df`` in the same order as the groups
    below -- confirm against the data. ``zip`` stops at the shortest
    input, so extra null columns are silently left untouched.

    Returns the resulting DataFrame.
    '''
    #calculate the columns with nulls
    count_na_df = df.isna().sum()
    # column labels with at least one null, in the frame's column order
    null_cols = list(count_na_df[count_na_df > 0].index)

    #means of each nmme measurement
    g_means =  ['nmme0-tmp2m-34w__nmme0mean', 
    'nmme-tmp2m-56w__nmmemean', 
    'nmme-prate-34w__nmmemean', 
    'nmme0-prate-56w__nmme0mean', 
    'nmme0-prate-34w__nmme0mean', 
    'nmme-prate-56w__nmmemean', 
    'nmme-tmp2m-34w__nmmemean']

    #measurements we have already
    # each g_N lists eight columns and lines up with g_means[N-1]
    g_1 = ['nmme0-tmp2m-34w__cancm30',
    'nmme0-tmp2m-34w__cancm40',
    'nmme0-tmp2m-34w__ccsm40',
    'nmme0-tmp2m-34w__cfsv20',
    'nmme0-tmp2m-34w__gfdlflora0',
    'nmme0-tmp2m-34w__gfdlflorb0',
    'nmme0-tmp2m-34w__gfdl0',
    'nmme0-tmp2m-34w__nasa0']

    g_2 = ['nmme-tmp2m-56w__cancm3',
    'nmme-tmp2m-56w__cancm4',
    'nmme-tmp2m-56w__ccsm4',
    'nmme-tmp2m-56w__cfsv2',
    'nmme-tmp2m-56w__gfdl',
    'nmme-tmp2m-56w__gfdlflora',
    'nmme-tmp2m-56w__gfdlflorb',
    'nmme-tmp2m-56w__nasa']

    g_3 = ['nmme-prate-34w__cancm3',
    'nmme-prate-34w__cancm4',
    'nmme-prate-34w__ccsm4',
    'nmme-prate-34w__cfsv2',
    'nmme-prate-34w__gfdl',
    'nmme-prate-34w__gfdlflora',
    'nmme-prate-34w__gfdlflorb',
    'nmme-prate-34w__nasa']

    g_4 = [ 'nmme0-prate-56w__cancm30',
    'nmme0-prate-56w__cancm40',
    'nmme0-prate-56w__ccsm40',
    'nmme0-prate-56w__cfsv20',
    'nmme0-prate-56w__gfdlflora0',
    'nmme0-prate-56w__gfdlflorb0',
    'nmme0-prate-56w__gfdl0',
    'nmme0-prate-56w__nasa0']

    g_5 = ['nmme0-prate-34w__cancm30',
    'nmme0-prate-34w__cancm40',
    'nmme0-prate-34w__ccsm40',
    'nmme0-prate-34w__cfsv20',
    'nmme0-prate-34w__gfdlflora0',
    'nmme0-prate-34w__gfdlflorb0',
    'nmme0-prate-34w__gfdl0',
    'nmme0-prate-34w__nasa0']

    g_6 = ['nmme-prate-56w__cancm3',
    'nmme-prate-56w__cancm4',
    'nmme-prate-56w__ccsm4',
    'nmme-prate-56w__cfsv2',
    'nmme-prate-56w__gfdl',
    'nmme-prate-56w__gfdlflora',
    'nmme-prate-56w__gfdlflorb',
    'nmme-prate-56w__nasa']

    g_7 = ['nmme-tmp2m-34w__cancm3',
    'nmme-tmp2m-34w__cancm4',
    'nmme-tmp2m-34w__ccsm4',
    'nmme-tmp2m-34w__cfsv2',
    'nmme-tmp2m-34w__gfdl',
    'nmme-tmp2m-34w__gfdlflora',
    'nmme-tmp2m-34w__gfdlflorb',
    'nmme-tmp2m-34w__nasa']

    #make a list of lists
    gs = [g_1, g_2, g_3, g_4, g_5, g_6, g_7]

    #go through and compute all features where there were nulls
    # positional pairing: i-th null column <-> i-th group <-> i-th mean column
    zip_cols = zip(null_cols, gs, g_means)
    for c, g, s in zip_cols:
        # presumably the mean column averages nine models, so the missing
        # ninth value is 9 * mean minus the sum of the eight listed -- TODO confirm.
        # The whole column is overwritten, not only the rows that were null.
        df[c] = (df[s]*9) - df[g].sum(1)

    #drop column we can't compute
    # NOTE(review): the label is the bare string 'ccsm30'; drop() raises
    # KeyError if no column has exactly that name -- confirm it exists.
    df = df.drop(columns='ccsm30')

    return df


################################# SUM OF PREPARATION 
def get_contest_data(df, features=None):
    '''
    Run a raw contest DataFrame through the preparation steps defined
    above: date parsing / column selection, column renaming, elevation
    binning and region binning.

    Parameters
    ----------
    df : raw DataFrame as loaded from the contest csv.
    features : optional list of raw column names to keep. When omitted,
        a module-level ``features`` list is used if one exists (the
        behaviour this function relied on before the parameter was
        added); otherwise every column is kept.

    Returns
    -------
    The prepared DataFrame.
    '''
    if features is None:
        # The body used to read an undeclared global named ``features``
        # and raised NameError when it was absent. Fall back to that
        # global when present so existing callers see no change.
        features = globals().get('features') or []

    df = prep_data(df, features=features)
    df = rename_data(df)
    df = create_elevation_bins(df)
    df = create_region_bins(df)

    return df
    
################################################################## SPLITTING DATA
def split_data(df, test_size=0.15):
    '''
    Split a DataFrame into train, validate and test frames.

    ``test_size`` is the fraction of all rows held out for test.
    Validate receives ``test_size + 0.05`` of all rows and train keeps
    the remainder. Both splits use a fixed random seed.

    Returns (train, validate, test).
    '''
    seed = 27

    remainder, test = train_test_split(df, test_size=test_size, random_state=seed)

    # Express the validate share relative to what is left after removing test
    validate_share = (test_size + 0.05) / (1 - test_size)
    train, validate = train_test_split(remainder, test_size=validate_share, random_state=seed)

    return train, validate, test

In [None]:
# acquire the raw training data and run it through the preparation pipeline
df = w.get_contest_data(w.get_explore_data())

# partition the prepared frame into train, validate, and test subsets
train, validate, test = w.split_data(df)

In [None]:
def data_distribution(df):
    '''
    Draw a histogram of the target column ``mean_temp``.
    Also sets the global seaborn font scale to 1.5.
    '''
    #set font size
    sns.set(font_scale=1.5)

    fig, ax = plt.subplots(figsize=(20, 8))
    sns.histplot(data=df, x='mean_temp', ax=ax)
    ax.set_xlabel('Mean Temp for Next 14 days')
    ax.set_title('Distribution of our Target Variable')
    
def region_viz(df):
    '''
    Two side-by-side panels: the number of observations per ``region``
    and the mean of ``mean_temp`` per ``region``, with a dotted line at
    the overall mean temperature.
    '''
    overall_mean = df['mean_temp'].mean()

    fig, (count_ax, mean_ax) = plt.subplots(1, 2, figsize=(20, 8))
    fig.suptitle('Is there a relationship between climate region and the mean temperature?')

    # left panel: how many rows each region contributes
    sns.countplot(x='region', data=df, ax=count_ax)
    count_ax.set_title('Distribution of Regions')

    # right panel: per-region mean against the overall mean
    sns.barplot(x='region', y='mean_temp', data=df, ax=mean_ax)
    mean_ax.set_title('Mean temp across regions')
    mean_ax.axhline(
        overall_mean,
        label=f'Average Temp Across All Regions {overall_mean:.2f}',
        linestyle='dotted',
        color='black',
    )
    mean_ax.legend()
    plt.show()

def region_stats_test(df):
    '''
    Run a Kruskal-Wallis test on ``mean_temp`` across the three
    ``region_bins`` groups (Dry, Temperate, Continental) and print the p-value.
    '''
    samples = [
        df.loc[df['region_bins'] == label, 'mean_temp']
        for label in ('Dry', 'Temperate', 'Continental')
    ]

    _, p_value = stats.kruskal(*samples)

    print(f'p-value: {p_value}')
    

def elevation_bin_viz(df):
    '''
    Two side-by-side panels: the number of observations per
    ``elevation_range`` bin and the mean of ``mean_temp`` per bin, with a
    dotted line at the overall mean temperature.
    '''
    overall_mean = df['mean_temp'].mean()

    fig, (count_ax, mean_ax) = plt.subplots(1, 2, figsize=(20, 8))
    fig.suptitle('Is there a relationship between elevation_range and the mean temperature?')

    # left panel: how many rows fall in each elevation bin
    sns.countplot(x='elevation_range', data=df, ax=count_ax)
    count_ax.set_title('Distribution of Elevation')

    # right panel: per-bin mean against the overall mean
    sns.barplot(x='elevation_range', y='mean_temp', data=df, ax=mean_ax)
    mean_ax.set_title('Mean temp across elevation bins')
    mean_ax.axhline(
        overall_mean,
        label=f'Average Temp Across All Elevations {overall_mean:.2f}',
        linestyle='dotted',
        color='black',
    )
    mean_ax.legend()
    plt.show()
    
    
def elevation_bin_kruskal_test(df):
    '''
    Run a Kruskal-Wallis test on ``mean_temp`` across the four
    ``elevation_range`` bins and print the p-value.
    '''
    bin_labels = ('bottom_low', 'top_low', 'mid', 'high')
    samples = [
        df.loc[df['elevation_range'] == label, 'mean_temp']
        for label in bin_labels
    ]

    _, p_value = stats.kruskal(*samples)

    print(f'p-value: {p_value}')


def elevation_bin_dist_viz(df):
    '''
    Draw one histogram of ``mean_temp`` per ``elevation_range`` bin, side
    by side, with shared axis limits so the four panels are comparable.
    '''
    # (bin value in the data, panel title)
    panels = [
        ('bottom_low', 'Bottom Low'),
        ('top_low', 'Top Low'),
        ('mid', 'Mid'),
        ('high', 'High'),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(20, 8))
    fig.suptitle('Distribution of Observations by Elevation Bin')

    for panel_ax, (bin_value, title) in zip(axes, panels):
        subset = df[df.elevation_range == bin_value]
        sns.histplot(x='mean_temp', data=subset, ax=panel_ax)
        panel_ax.set_title(title)
        panel_ax.set_ylim(0, 2000)
        panel_ax.set_xlim(-20, 40)
        panel_ax.set_ylabel('')

    plt.show()
    
    
def precipitation_viz(df):
    '''
    Two side-by-side panels: a histogram of ``precip`` and a regression
    plot of ``precip`` against ``mean_temp`` with a dotted line at the
    overall mean precipitation.
    '''
    fig, (hist_ax, reg_ax) = plt.subplots(1, 2, figsize=(20, 8))

    # left panel: distribution of the feature
    sns.histplot(data=df, x='precip', ax=hist_ax)
    hist_ax.set_title('Distribution of Precipitation')

    # right panel: relationship with the target
    sns.regplot(x='mean_temp', y='precip', data=df, line_kws={'color': 'red'}, ax=reg_ax)
    reg_ax.set_title('Is there a correlation between mean temp and precipitation?')
    overall_mean = df['precip'].mean()
    reg_ax.axhline(
        overall_mean,
        label=f'Overall Mean Precipitation: {overall_mean:.2f}',
        linestyle='dotted',
        color='black',
    )
    reg_ax.legend()
    plt.show()
    
def precip_spearmanr_test(df):
    '''
    Run a Spearman rank correlation between ``precip`` and ``mean_temp``
    and print the p-value.
    '''
    result = stats.spearmanr(df['precip'], df['mean_temp'])
    p_value = result[1]
    print(f'p-value: {p_value}')

    
def potential_evap_viz(df):
    '''
    Two side-by-side panels: a histogram of ``potential_evap`` and a
    regression plot of ``potential_evap`` against ``mean_temp`` with a
    dotted line at the overall mean potential evaporation.
    '''
    fig, (hist_ax, reg_ax) = plt.subplots(1, 2, figsize=(20, 8))

    # left panel: distribution of the feature
    sns.histplot(data=df, x='potential_evap', ax=hist_ax)
    hist_ax.set_title('Distribution of Potential Evaporation')

    # right panel: relationship with the target
    sns.regplot(x='mean_temp', y='potential_evap', data=df, line_kws={'color': 'red'}, ax=reg_ax)
    reg_ax.set_title('Is there a correlation between mean temp and potential evaporation?')
    overall_mean = df['potential_evap'].mean()
    reg_ax.axhline(
        overall_mean,
        label=f'Overall Mean Potential Evaporation: {overall_mean:.2f}',
        linestyle='dotted',
        color='black',
    )
    reg_ax.legend()
    reg_ax.set_ylim(-50, 1200)
    plt.show()
    
def potential_evap_spearmanr_test(df):
    '''
    Run a Spearman rank correlation between ``potential_evap`` and
    ``mean_temp`` and print the p-value.
    '''
    result = stats.spearmanr(df['potential_evap'], df['mean_temp'])
    p_value = result[1]
    print(f'p-value: {p_value}')
    
def geopotential_viz(df):
    '''
    Scatter ``mean_temp`` against the four ``height_*_mb`` columns on a
    2x2 grid, coloured by ``region_bins``. Only a random 1% sample of
    the rows is plotted; the sample is unseeded, so it differs per call.
    '''
    sampled = df.sample(frac=.01)

    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    fig.suptitle('Is there a correlation between mean temp and geopotential pressure at different heights?')

    # panels fill row by row: 10, 100 on top; 500, 850 below
    levels = (10, 100, 500, 850)
    for panel_ax, level in zip(axes.flat, levels):
        sns.scatterplot(x=f'height_{level}_mb', y='mean_temp', hue='region_bins', data=sampled, ax=panel_ax)
        panel_ax.set_title(f'At {level} Milibars')
        panel_ax.legend()

    plt.show()

In [None]:
def scale_data(train, validate, test):
    '''
    Min-max scale every numeric column of train, validate and test.

    The columns to scale are the numeric columns of ``train``; the scaler
    is fitted on ``train`` only and then applied to all three frames.
    Returns copies (train_scaled, validate_scaled, test_scaled); the
    inputs are left untouched.
    '''
    numeric_cols = list(train.select_dtypes(include=np.number).columns)

    scaler = pre.MinMaxScaler()
    scaler.fit(train[numeric_cols])

    def _scaled_copy(frame):
        # transform, then re-attach the original column labels and index
        scaled_values = pd.DataFrame(
            scaler.transform(frame[numeric_cols]),
            columns=frame[numeric_cols].columns.values,
            index=frame.index.values,
        )
        result = frame.copy()
        result[numeric_cols] = scaled_values
        return result

    train_scaled = _scaled_copy(train)
    validate_scaled = _scaled_copy(validate)
    test_scaled = _scaled_copy(test)

    return train_scaled, validate_scaled, test_scaled

def prep_for_model(train, validate, test, target, drivers):
    '''
    Build model-ready X / y pairs from the train, validate and test frames.

    The frames are scaled with ``scale_data``, restricted to ``drivers``,
    and non-numeric drivers are one-hot encoded (first level dropped).
    Category levels are taken from ``train`` so that validate and test
    always get the same dummy columns as train; a level that does not
    occur in train is encoded as all zeros.

    Parameters
    ----------
    train, validate, test : DataFrames from the split.
    target : name of the target column (taken from the unscaled frames).
    drivers : list of feature column names.

    Returns
    -------
    X_train, y_train, X_validate, y_validate, X_test, y_test
    '''
    #scale data
    train_scaled, validate_scaled, test_scaled = scale_data(train, validate, test)

    X_train = train_scaled[drivers]

    #make list of cat variables to make dummies for
    cat_vars = list(X_train.select_dtypes(exclude=np.number).columns)

    # Freeze the levels seen in train. Encoding each split on its own
    # gives mismatched columns whenever a level is missing from a split.
    train_levels = {
        col: pd.Categorical(X_train[col]).categories
        for col in cat_vars
        if X_train[col].dtype == object
        or isinstance(X_train[col].dtype, pd.CategoricalDtype)
    }

    def _encode(frame):
        # One-hot encode ``cat_vars`` using the levels frozen from train.
        frame = frame.copy()
        for col, levels in train_levels.items():
            frame[col] = pd.Categorical(frame[col], categories=levels)
        dummies = pd.get_dummies(frame[cat_vars], dummy_na=False, drop_first=True)
        return pd.concat([frame, dummies], axis=1).drop(columns=cat_vars)

    X_train = _encode(X_train)
    y_train = train[target]

    X_validate = _encode(validate_scaled[drivers])
    y_validate = validate[target]

    X_test = _encode(test_scaled[drivers])
    y_test = test[target]

    return X_train, y_train, X_validate, y_validate, X_test, y_test

def regression_models(X_train, y_train, X_validate, y_validate):
    '''
    Fit three regressors on train and report their RMSE on train and validate.

    Models, in order: ordinary least squares ('OLS Regressor'),
    LassoLars with alpha=1 ('Lasso_alpha_1'), and least squares on
    degree-2 polynomial features ('Quadratic').

    Returns a DataFrame with one row per model and the columns produced
    by ``make_metric_df`` (model, RMSE_train, RMSE_validate).
    '''
    # degree-2 feature expansion, fitted on train only
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)

    # ``normalize=True`` is no longer passed to LinearRegression: the
    # argument was removed in scikit-learn 1.2 and raises TypeError there.
    # For an unregularised least-squares fit on a full-rank design it does
    # not change the predictions.
    candidates = [
        ('OLS Regressor', LinearRegression(), X_train, X_validate),
        ('Lasso_alpha_1', LassoLars(alpha=1), X_train, X_validate),
        ('Quadratic', LinearRegression(), X_train_degree2, X_validate_degree2),
    ]

    # create the metric_df as a blank dataframe
    metric_df = pd.DataFrame()

    for model_name, estimator, train_X, validate_X in candidates:
        estimator.fit(train_X, y_train)
        metric_df = make_metric_df(
            y_train,
            estimator.predict(train_X),
            y_validate,
            estimator.predict(validate_X),
            metric_df,
            model_name=model_name,
        )

    return metric_df

def make_metric_df(y_train, y_train_pred, y_validate, y_validate_pred,  metric_df,model_name ):
    '''
    Add one model's train / validate RMSE to a running metrics table.

    Parameters
    ----------
    y_train, y_train_pred : actual and predicted target on train.
    y_validate, y_validate_pred : actual and predicted target on validate.
    metric_df : table built so far; an empty DataFrame starts a new one.
    model_name : label stored in the ``model`` column.

    Returns
    -------
    A new DataFrame with columns model, RMSE_train, RMSE_validate and
    the new row appended; ``metric_df`` itself is not modified.
    '''
    new_row = pd.DataFrame(data=[
        {
            'model': model_name,
            'RMSE_train': metric.mean_squared_error(
                y_train,
                y_train_pred) ** .5,
            'RMSE_validate': metric.mean_squared_error(
                y_validate,
                y_validate_pred) ** .5
        }])

    if metric_df.size == 0:
        return new_row

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([metric_df, new_row], ignore_index=True)

def baseline_models(y_train, y_validate):
    '''
    Score the mean baseline: predict the mean of ``y_train`` for every
    row and report the RMSE on train and validate.

    Returns a one-row DataFrame with columns model, RMSE_train, RMSE_validate.
    '''
    baseline_value = y_train.mean()

    # constant prediction column alongside the actual values
    train_predictions = pd.DataFrame(y_train).assign(y_pred_mean=baseline_value)
    validate_predictions = pd.DataFrame(y_validate).assign(y_pred_mean=baseline_value)

    rmse_train = metric.mean_squared_error(
        y_train, train_predictions['y_pred_mean']) ** .5
    rmse_validate = metric.mean_squared_error(
        y_validate, validate_predictions['y_pred_mean']) ** .5

    summary_row = {
        'model': 'mean_baseline',
        'RMSE_train': rmse_train,
        'RMSE_validate': rmse_validate,
    }
    return pd.DataFrame(data=[summary_row])

def best_model(X_train, y_train, X_validate, y_validate, X_test, y_test):
    '''
    Fit the degree-2 polynomial least-squares model on train and report
    its RMSE on train, validate and test.

    Returns a one-row DataFrame with columns model, RMSE_train,
    RMSE_validate and RMSE_test.
    '''
    # make the polynomial features to get a new set of features
    pf = PolynomialFeatures(degree=2)
    # fit on train only, then apply the same expansion to validate and test
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)
    X_test_degree2 = pf.transform(X_test)

    # ``normalize=True`` is no longer passed: the argument was removed in
    # scikit-learn 1.2 and raises TypeError there. For an unregularised
    # least-squares fit on a full-rank design it does not change the
    # predictions.
    lm2 = LinearRegression()
    lm2.fit(X_train_degree2, y_train)

    def _rmse(actual, design):
        # root mean squared error of the fitted model on one split
        return metric.mean_squared_error(actual, lm2.predict(design)) ** .5

    metric_df = pd.DataFrame(data=[
            {
                'model': 'Quadratic',
                'RMSE_train': _rmse(y_train, X_train_degree2),
                'RMSE_validate': _rmse(y_validate, X_validate_degree2),
                'RMSE_test': _rmse(y_test, X_test_degree2),
            }])

    return metric_df


def scale_kaggle_data(train, test, target):
    '''
    Min-max scale the numeric columns of the kaggle train and test
    frames, leaving the target column unscaled.

    The columns to scale are the numeric columns of ``train`` minus
    ``target``; the scaler is fitted on ``train`` only. Returns copies
    (train_scaled, test_scaled).
    '''
    numeric_cols = list(train.select_dtypes(include=np.number).columns)
    numeric_cols.remove(target)

    scaler = pre.MinMaxScaler()
    scaler.fit(train[numeric_cols])

    def _scaled_copy(frame):
        # transform, then re-attach the original column labels and index
        scaled_values = pd.DataFrame(
            scaler.transform(frame[numeric_cols]),
            columns=frame[numeric_cols].columns.values,
            index=frame.index.values,
        )
        result = frame.copy()
        result[numeric_cols] = scaled_values
        return result

    train_scaled = _scaled_copy(train)
    test_scaled = _scaled_copy(test)

    return train_scaled, test_scaled

def prep_for_kaggle(train, test, target, drivers):
    '''
    Build model-ready X_train, y_train and X_test from the kaggle frames.

    The frames are scaled with ``scale_kaggle_data``, restricted to
    ``drivers``, and non-numeric drivers are one-hot encoded (first
    level dropped). Category levels are taken from ``train`` so that
    ``X_test`` always has the same dummy columns as ``X_train``; a level
    that does not occur in train is encoded as all zeros.

    Parameters
    ----------
    train, test : kaggle DataFrames (``test`` has no target).
    target : name of the target column in ``train``.
    drivers : list of feature column names.

    Returns
    -------
    X_train, y_train, X_test
    '''
    #scale data
    train_scaled, test_scaled = scale_kaggle_data(train, test, target)

    X_train = train_scaled[drivers]

    #make list of cat variables to make dummies for
    cat_vars = list(X_train.select_dtypes(exclude=np.number).columns)

    # Freeze the levels seen in train. Encoding each frame on its own
    # gives mismatched columns whenever a level is missing from one of them.
    train_levels = {
        col: pd.Categorical(X_train[col]).categories
        for col in cat_vars
        if X_train[col].dtype == object
        or isinstance(X_train[col].dtype, pd.CategoricalDtype)
    }

    def _encode(frame):
        # One-hot encode ``cat_vars`` using the levels frozen from train.
        frame = frame.copy()
        for col, levels in train_levels.items():
            frame[col] = pd.Categorical(frame[col], categories=levels)
        dummies = pd.get_dummies(frame[cat_vars], dummy_na=False, drop_first=True)
        return pd.concat([frame, dummies], axis=1).drop(columns=cat_vars)

    X_train = _encode(X_train)
    y_train = train[target]

    X_test = _encode(test_scaled[drivers])

    return X_train, y_train, X_test

In [None]:
from sklearn.model_selection import KFold

# NOTE(review): `X`, `y` and `model` are not defined in any cell above;
# they must exist in the kernel before this cell runs -- confirm where
# they come from so the notebook survives Restart & Run All.

# Define the number of folds for the cross-validation
n_folds = 5

# Split the data into n_folds parts
kf = KFold(n_splits=n_folds)


def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an array."""
    # KFold yields positional indices. Plain ``data[positions]`` on a
    # DataFrame would select columns by label instead of rows.
    return data.iloc[positions] if hasattr(data, "iloc") else data[positions]


for train_index, test_index in kf.split(X):
    X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
    y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

    # Train the model on the training data and predict the held-out fold.
    # Only the last fold's predictions remain in `y_pred` after the loop.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)


In [None]:
from sklearn.preprocessing import LabelEncoder

# NOTE(review): `X`, `y` and `model` are not defined in any cell above;
# they must exist in the kernel before this cell runs.

# Encode the data into numeric form. `apply` refits the single encoder
# on every column of X in turn, so each column gets its own integer
# codes and the encoder object only retains the last column's classes.
encoder = LabelEncoder()
X_encoded = X.apply(encoder.fit_transform)

# Initialize a list to store the predictions (one array per left-out row)
y_pred = []

n_rows = X_encoded.shape[0]
all_positions = np.arange(n_rows)

# Leave-one-out: iterate over each row position of the encoded data
for i in range(n_rows):
    # Everything here is positional. The previous `drop(i)` was
    # label-based and disagreed with `.iloc[i]` unless the index
    # happened to be 0..n-1.
    keep = np.delete(all_positions, i)
    X_temp = X_encoded.iloc[keep]
    y_temp = y.iloc[keep]

    # Train the model on all but one row of the data
    model.fit(X_temp, y_temp)

    # Predict the target variable for the left-out row. `iloc[[i]]` keeps
    # a one-row DataFrame so the column names match those used in fit.
    y_pred.append(model.predict(X_encoded.iloc[[i]]))
