## Data Loading and Preparation

In [11]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics  import r2_score, mean_absolute_error, mean_squared_error

In [2]:
# Load the CSV from the data lake through the Spark session, using the first
# line of the file as column names. No schema is inferred here, so every
# column arrives as a string.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('abfss://datalakeforasa@sorgefordatalake.dfs.core.windows.net/DemoDW/FinalVersionOfOurData.csv')
)

# Collect the Spark DataFrame onto the driver as a pandas DataFrame
df = df.toPandas()

# Preview the first 5 rows
df.head(5)

### Data Cleaning and Transformation

In [3]:
# Drop the 'date' column. A non-in-place drop with errors='ignore' keeps this
# cell re-runnable: the original in-place drop raised KeyError on a second run
# because the column had already been removed.
df = df.drop(columns=['date'], errors='ignore')

# The remaining string columns hold numbers written with a decimal comma.
# Swap the comma for a dot (literal replacement, not a regex) and cast to
# float32. Columns that are already numeric are skipped, so re-running is a no-op.
for col in df.columns:
    if df[col].dtype == 'object':  # only string-typed columns need converting
        df[col] = df[col].str.replace(',', '.', regex=False).astype('float32')


### Handling Duplicates

In [4]:
# Report how many fully duplicated rows exist before removing them. A bare
# expression in the middle of a cell is never displayed, so print it explicitly.
print(f'Duplicate rows found: {df.duplicated().sum()}')

# Keep the first occurrence of each duplicated row
df = df.drop_duplicates()

# Shape after de-duplication (displayed as the cell output)
df.shape

### Exploratory Data Analysis: Correlation Matrix

In [5]:
# Large square canvas so the annotated cells stay readable
fig = plt.figure(figsize=(18, 18))

# Pairwise correlation of all columns, with the coefficient written in each cell
sns.heatmap(
    data=df.corr(),
    annot=True,
    linecolor='white',
    linewidths=0.003,
)

### Exploratory Data Analysis: Boxplots

In [6]:
# One standalone boxplot per column to eyeball spread and outliers
for column in df.columns:
    plt.figure()  # fresh figure so plots do not overlay each other
    sns.boxplot(data=df, y=column).set_title(f'Boxplot of {column}')
    plt.show()

### Split the dataset into features (X) and target variable (y)

In [14]:
# Target: the silica concentrate percentage
y = df['% Silica Concentrate']

# Features: every remaining column (the target is removed by its own name)
X = df.drop(columns=[y.name])

### Split the data into training and testing sets (80% train, 20% test)

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Feature scaling (optional for Random Forest: tree-based models are insensitive to feature scale)

In [16]:
# Independent standardisers: one for the feature matrix, one for the target
scaler_X, scaler_y = StandardScaler(), StandardScaler()

In [17]:
# Features: learn mean/std on the training split only, then apply to both splits
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Target: StandardScaler expects a 2-D input, so view the Series as a single
# column, scale it, and flatten the result back to 1-D
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_train_scaled = scaler_y.fit_transform(y_train_column).flatten()
y_test_scaled = scaler_y.transform(y_test.to_numpy().reshape(-1, 1)).flatten()

### Model: Random Forest

In [18]:
# Random Forest regressor. random_state is fixed (same seed as the train/test
# split) so the bootstrap samples and per-split feature subsets, and therefore
# the reported metrics, are reproducible across Restart & Run All.
model = RandomForestRegressor(
    n_estimators=300,     # Number of trees
    max_depth=15,         # Limit the depth of each tree
    min_samples_split=6,  # Minimum number of samples required to split a node
    min_samples_leaf=3,   # Minimum number of samples required to be at a leaf node
    max_features='sqrt',  # Number of features to consider when looking for the best split
    random_state=42,      # Seed for reproducible training
)

In [19]:
# Train the forest on the standardised training features and target
model.fit(X=X_train_scaled, y=y_train_scaled)

### Evaluate the model (error metrics and R-squared score)

In [20]:
# Predictions for the held-out rows, on the standardised target scale
Predicted = model.predict(X=X_test_scaled)

In [21]:
# Metrics on the standardised target. R² is unaffected by the scaling, but this
# MSE is in z-score units, not in the units of '% Silica Concentrate'.
mse = mean_squared_error(y_test_scaled, Predicted)
r2 = r2_score(y_test_scaled, Predicted)

# Undo the target standardisation so the errors can also be read in the
# target's original units.
predicted_original = scaler_y.inverse_transform(Predicted.reshape(-1, 1)).flatten()
mse_original = mean_squared_error(y_test, predicted_original)
mae_original = mean_absolute_error(y_test, predicted_original)

print(f'Mean Squared Error (standardized target): {mse:.2f}')
print(f'Mean Squared Error (original units): {mse_original:.4f}')
print(f'Mean Absolute Error (original units): {mae_original:.4f}')

print(f'R² Score: {r2:.2f}')