##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [76]:
#%pip install pandas 
#%pip install matplotlib
%pip install scikit-learn
%pip install seaborn
%pip install imbalanced-learn

# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [77]:
# Can have as many cells as you want for code
import pandas as pd
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

In [78]:
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.read_parquet(filepath, engine='pyarrow')

def data_cleaning(df):
    
    #missing value
    ##remove 
    thres = 75
    for col in df.columns:
        if col == 'f_purchase_lh':
            continue;
        percent = df[col].isna().sum()/df.shape[0]*100
        if percent > thres:
            df = df.drop(columns = col)
    
    ##impute
    
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
    numeric_cols = numeric_cols[numeric_cols !=  'f_purchase_lh']
    df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))
    
    object_imputer = SimpleImputer(missing_values = None, strategy='most_frequent')
    object_cols_with_missing = df.columns[df.isnull().any() & (df.columns != 'f_purchase_lh')]
    df[object_cols_with_missing] = object_imputer.fit_transform(df[object_cols_with_missing])
    return df

In [79]:
from sklearn.preprocessing import OrdinalEncoder

def encode(df):

    #ordinal encode
    object_columns = df.select_dtypes('object').columns
    ordinal_encoder = OrdinalEncoder()
    for col in object_columns:
         df[col] = ordinal_encoder.fit_transform(df[[col]])

    return df


### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [80]:
###...code...###

In [81]:
#feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

def imbalance(df):
    df['f_purchase_lh'] = df['f_purchase_lh'].fillna(-999)
    X = df.drop('f_purchase_lh', axis=1)
    y = df['f_purchase_lh']
    target_column = 'f_purchase_lh'
    #handle imbalance
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name=target_column)], axis=1)
    return df_resampled

def feature_selection(df_resampled):
    X_resampled=df_resampled.drop('f_purchase_lh', axis=1)
    
    # Variance Threshold
    selector = VarianceThreshold(threshold=0.05) 
    X_resampled_high_variance = selector.fit_transform(X_resampled)
    
    selected_indices = selector.get_support(indices=True)
    selected_df = pd.DataFrame(X_resampled_high_variance, columns=X_resampled.columns[selected_indices])
    selected_df['f_purchase_lh'] = df_resampled['f_purchase_lh']
    return selected_df

In [82]:
#Implementing Logistic Regression using sklearn
from sklearn.preprocessing import StandardScaler
def scale_and_train(selected_df):
    X = selected_df.drop('f_purchase_lh', axis=1)
    y = selected_df['f_purchase_lh']
    x_train, x_test, y_train, y_test = train_test_split(X,y, test_size= 0.2)
    modelLogistic = LogisticRegression(solver='lbfgs', max_iter=1000)
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    modelLogistic.fit(x_train_scaled,y_train)
    
    # print("The intercept b0= ", modelLogistic.intercept_)
    
    # print("The coefficient b1= ", modelLogistic.coef_)
    #Make prediction for the test data
    y_pred= modelLogistic.predict(x_test_scaled)
    return y_pred

In [83]:
#accuracy test
from sklearn.metrics import confusion_matrix, accuracy_score
def test(y_test, y_pred):
    ConfusionMatrix = confusion_matrix(y_test, y_pred)
    TP= ConfusionMatrix[1,1] #True positive
    TN= ConfusionMatrix[0,0] #True negative
    Total=len(y_test)
    return accuracy_score(y_test, y_pred)
    # return (TN+TP)/Total

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [84]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    df = data_cleaning(hidden_data)
    df = encode(df)
    df_resampled = imbalance(df)
    selected_df = feature_selection(df_resampled)
    
    result = [] 
    #scale and train data 
    X = selected_df.drop('f_purchase_lh', axis=1)
    y = selected_df['f_purchase_lh']
    x_train, x_test, y_train, y_test = train_test_split(X,y, test_size= 0.2)
    modelLogistic = LogisticRegression(solver='lbfgs', max_iter=1000)
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    modelLogistic.fit(x_train_scaled,y_train)
   
    #Make prediction for the test data
    y_pred = modelLogistic.predict(x_test_scaled)
    accuracy = test(y_test, y_pred)
    print("Accuracy from confusion matrix is ", accuracy)
    result = y_pred.tolist()

    return result

##### Cell to check testing_hidden_data function

In [86]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
print(testing_hidden_data(test_df))

[[3108  304]
 [ 485 3016]]
0.8858672067119919
Accuracy from confusion matrix is  0.8858672067119919
[1.0, 1.0, -999.0, -999.0, -999.0, -999.0, -999.0, 1.0, 1.0, 1.0, 1.0, -999.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -999.0, -999.0, 1.0, 1.0, 1.0, 1.0, -999.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -999.0, 1.0, 1.0, 1.0, 1.0, -999.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -999.0, 1.0, 1.0, -999.0, -999.0, -999.0, 1.0, 1.0, -999.0, 1.0, 1.0, -999.0, 1.0, -999.0, 1.0, 1.0, 1.0, -999.0, 1.0, 1.0, 1.0, -999.0, 1.0, 1.0, 1.0, -999.0, -999.0, -999.0, 1.0, -999.0, 1.0, 1.0, -999.0, -999.0, -999.0, 1.0, -999.0, 1.0, 1.0, 1.0, -999.0, 1.0, -999.0, -999.0, -999.0, 1.0, -999.0, -999.0, 1.0, -999.0, 1.0, -999.0, -999.0, -999.0, 1.0, 1.0, -999.0, 1.0, 1.0, -999.0, -999.0, 1.0, -999.0, -999.0, -999.0, 1.0, 1.0, -999.0, -999.0, -999.0, -999.0, -999.0, -999.0, -999.0, 1.0, -999.0, -999.0, 1.0, 1.0, -999.0, -999.0, 1.0, -999.0, 1.0, 1.0, -999.0, 1.0, 1.0, -999.0, -999.0, -999.0, -999.0, 1.0, -999.0, 1.0, 1.0, 1.0, -99

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!