##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [1]:
#%pip install pandas 
#%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME
#%pip install seaborn 
#%pip install -U scikit-learn
#%pip install tensorflow

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .csv file inside that. A relative path *must* be used when loading data into pandas

In [2]:
# Can have as many cells as you want for code
import pandas as pd
filepath = "./data/catA_train.csv" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [3]:
###...code...###
import os 
os.chdir('C:/Users/alber/OneDrive/Documents')
filepath = 'C:/Users/alber/Downloads/catA_train.csv'

data = pd.read_csv(filepath)
threshold = 0.5  # threshold to remove columns
columns_to_drop = data.columns[data.isnull().mean() > threshold]
data_cleaned = data.drop(columns=columns_to_drop)

# Impute missing values for remaining columns
# For numerical columns, use mean  or median
# For categorical columns, use mode
for column in data_cleaned.columns:
    if data_cleaned[column].dtype == 'object':
        data_cleaned[column].fillna(data_cleaned[column].mode()[0], inplace=True)
    else:
        data_cleaned[column].fillna(data_cleaned[column].median(), inplace=True)

data.isna().sum()
data_cleaned.isna().sum()


LATITUDE                               0
LONGITUDE                              0
AccountID                              0
Company                                0
SIC Code                               0
Industry                               0
8-Digit SIC Code                       0
8-Digit SIC Description                0
Year Found                             0
Entity Type                            0
Parent Company                         0
Parent Country                         0
Ownership Type                         0
Company Description                    0
Company Status (Active/Inactive)       0
Employees (Single Site)                0
Employees (Domestic Ultimate Total)    0
Employees (Global Ultimate Total)      0
Sales (Domestic Ultimate Total USD)    0
Sales (Global Ultimate Total USD)      0
Global Ultimate Company                0
Global Ultimate Country                0
Domestic Ultimate Company              0
Is Domestic Ultimate                   0
Is Global Ultima

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
import os 
os.chdir('C:/Users/alber/OneDrive/Documents')

data = pd.read_csv(filepath)

# Preprocess the data
# Handle missing values, remove or impute
# Assuming missing values are handled as per previous discussion

# Encoding categorical variables
categorical_columns = data.select_dtypes(include=['object']).columns
one_hot_encoder = OneHotEncoder(sparse=False)
encoded_categorical = one_hot_encoder.fit_transform(data[categorical_columns])
encoded_categorical = pd.DataFrame(encoded_categorical, columns=one_hot_encoder.get_feature_names_out(categorical_columns))
data = data.drop(columns=categorical_columns).join(encoded_categorical)

# Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop(columns=['Sales (Global Ultimate Total USD)']))  # Adjust the column name
scaled_features = pd.DataFrame(scaled_features, columns=data.columns.difference(['Sales (Global Ultimate Total USD)']))

# Splitting the dataset into the Training set and Test set
X = scaled_features
y = data['Sales (Global Ultimate Total USD)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the Neural Network
model = Sequential()
model.add(Dense(units=64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1))  # Single unit for regression output

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Predicting the Test set results
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# You might also want to calculate other metrics such as MAE or R^2 based on your needs


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [None]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''

    result = [] 
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
test_df = pd.read_csv(filepath)
test_df = test_df.drop(columns=['Sales (Domestic Ultimate Total USD)'])
print(testing_hidden_data(test_df))

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!