In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import pickle
from faker import Faker


In [2]:
df = pd.read_csv('/Users/meghakatiyar/M2M_WIL5/WIL5/Data/synthetic_data_new.csv')
df = df.fillna('None') ## to change NaN values to "None"

In [3]:
df.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,E0000,34,,,Minor,,Pass,7,Moderate
1,E0001,91,,,,,Fail,7,Moderate
2,E0002,191,,,,,Pass,6,Low
3,E0003,143,Minor,,,,Pass,7,Moderate
4,E0004,38,,,,,Pass,6,Low


In [4]:
df=df.drop(columns=['Entity ID'])

#### Econding Categorical data to integer

In [7]:
def encoding(item):
    if item in ['Low']:
        return 0
    elif item in ['Pass', 'Moderate', 'None']:
        return 1
    elif item in ['Minor', 'Fail', 'Within past year', 'Flagged', 'High']:
        return 2
    elif item in ['Within past 1-3 years', 'Major']:
        return 3
    elif isinstance(item, (int, float)) and item < 200:
        return 1
    elif isinstance(item, (int, float)) and 200 <= item <= 500:
        return 2
    else:
        return 3

In [8]:
exclude_columns = ['Total Risk Score', 'Risk Category']
#Encoding all the columns except for risk Score & Risk result
encode_columns = [col for col in df.columns if col not in exclude_columns]

for col in encode_columns:
    df[col] = df[col].apply(encoding)

df.head()

Unnamed: 0,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,1,1,1,2,1,1,7,Moderate
1,1,1,1,1,1,2,7,Moderate
2,1,1,1,1,1,1,6,Low
3,1,2,1,1,1,1,7,Moderate
4,1,1,1,1,1,1,6,Low


#### Train Test Split

In [9]:
# Split data into training and test sets

y = df['Risk Category']
x = df.drop(columns=['Risk Category','Total Risk Score'])

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=101)

#### Balance Classes


In [10]:
smote = SMOTE(random_state=42)
x_train_resampled,y_train_resampled=smote.fit_resample(x_train,y_train)

#### Model Training -- Random Forest

In [11]:
# Train Random Forest model with best hyperparameters

rfc = RandomForestClassifier()
parameters = {'min_samples_leaf':[1,2,4],'min_samples_split':[2,5,100],'n_estimators':[10,20,30,100]}
rfc_cv=GridSearchCV(rfc,parameters)
rfc_cv.fit(x_train_resampled,y_train_resampled)
print('tuned hyperparameters: (best parameters)',rfc_cv.best_params_)
print('Best Parameters Accuracy score:', rfc_cv.best_score_)


tuned hyperparameters: (best parameters) {'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}
Best Parameters Accuracy score: 0.9880386983289358


In [12]:
rfc = RandomForestClassifier(min_samples_leaf=1, min_samples_split= 5, n_estimators= 30)
rfc.fit(x_train_resampled,y_train_resampled)

In [13]:
# Make predictions
y_predict=rfc.predict(x_test)

In [14]:
# Evaluate the model
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

        High       0.89      0.96      0.92        50
         Low       0.98      1.00      0.99       489
    Moderate       1.00      0.97      0.98       461

    accuracy                           0.98      1000
   macro avg       0.96      0.98      0.97      1000
weighted avg       0.98      0.98      0.98      1000



#### Export Model

In [106]:
model_path = '../models/RiskPredictor_RF_V3.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(rfc, file)

#### Loading Model

In [107]:
with open(model_path, 'rb') as file:
    model = pickle.load(file)

#### Preparing the data to predict

In [86]:
df_dashboard = pd.read_csv('../data/synthetic_data_dashboard.csv')
df_dashboard_copy=df_dashboard.copy()
df_dashboard_copy = df_dashboard_copy.fillna('None') ## to change NaN values to "None"

y_dash=df_dashboard_copy['Risk Category']
x_dash=df_dashboard_copy.drop(columns=['Entity ID','Risk Category','Total Risk Score'])


In [87]:
df_dashboard.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,E0000,179,,Within past year,,,Pass,8,Moderate
1,E0001,138,,,,,Pass,6,Low
2,E0002,157,,,,,Pass,6,Low
3,E0003,50,,,,Flagged,Pass,7,Moderate
4,E0004,150,,,,,Pass,6,Low


In [88]:
encode_columns = x_dash.columns  # Assuming all columns in x_dash need encoding
for col in encode_columns:
    x_dash[col] = x_dash[col].apply(encoding)

In [89]:
# Make predictions
y_predict_dash=model.predict(x_dash)


In [90]:
df_dashboard_predicted=df_dashboard.copy()
df_dashboard_predicted = df_dashboard_predicted.fillna('None')

In [91]:
df_dashboard_predicted.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,E0000,179,,Within past year,,,Pass,8,Moderate
1,E0001,138,,,,,Pass,6,Low
2,E0002,157,,,,,Pass,6,Low
3,E0003,50,,,,Flagged,Pass,7,Moderate
4,E0004,150,,,,,Pass,6,Low


In [92]:
df_dashboard_predicted['Predicted Risk Category'] = y_predict_dash

In [93]:
probability = model.predict_proba(x_dash)
# Define the column titles
column_titles = ['High %', 'Low %', 'Moderate %']

# Ensure the length of column_titles matches the number of classes
if len(column_titles) != probability.shape[1]:
    raise ValueError("The length of column_titles does not match the number of classes in the probability array.")

# Add the probabilities to x_dash with custom column titles
for i, title in enumerate(column_titles):
    df_dashboard_predicted[title] = probability[:, i]




In [121]:
df_dashboard_predicted.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Predicted Risk Category,High %,Low %,Moderate %,Risk Score,Phone Number,Address
0,E0000,179,,Within past year,,,Pass,Moderate,0.0,0.0,1.0,2.0,(847)278-8493x90374,"1212 Tanya River\nLake Allisonview, NT B5R 5A8"
1,E0001,138,,,,,Pass,Low,0.0,0.979224,0.020776,1.020776,204-953-2138,"9133 Young Shoal\nSouth Jamie, AB K3K5V5"
2,E0002,157,,,,,Pass,Low,0.0,0.979224,0.020776,1.020776,(718)897-1945x8676,"2104 Miller Fall\nKellytown, MB P2L7N3"
3,E0003,50,,,,Flagged,Pass,Moderate,0.0,0.0,1.0,2.0,3628619105,"763 Mary Extensions Apt. 737\nKristinmouth, NT..."
4,E0004,150,,,,,Pass,Low,0.0,0.979224,0.020776,1.020776,(210)274-0674x1032,"6776 Alexa Unions\nDavidfurt, NU H3T 5M1"


In [125]:
# Define risk score weights based on Predicted Risk Category
risk_score_weights = {
    'High': 3,
    'Moderate': 2,
    'Low': 1
}

# Calculate the weighted risk score for each row
df_dashboard_predicted['Risk Score'] = (
    df_dashboard_predicted['High %'] * risk_score_weights['High'] +
    df_dashboard_predicted['Moderate %'] * risk_score_weights['Moderate'] +
    df_dashboard_predicted['Low %'] * risk_score_weights['Low']
)
# Adjust the risk score based on Annual Clients and Predicted Risk Category
# Higher Annual Clients increase the risk score, but scaled based on Predicted Risk Category
for index, row in df_dashboard_predicted.iterrows():
    if row['Predicted Risk Category'] == 'Moderate':
        # Scale down the impact of Annual Clients for Moderate risk
        df_dashboard_predicted.at[index, 'Risk Score'] += min(row['Annual Clients'] * 0.003, 1.0)   # Adjust coefficient as needed
    elif row['Predicted Risk Category'] == 'Low':
        # Scale down the impact of Annual Clients for Low risk
        df_dashboard_predicted.at[index, 'Risk Score'] += row['Annual Clients'] * 0.002  # Adjust coefficient as needed
    else:
        # High risk category, Annual Clients have a stronger impact
        df_dashboard_predicted.at[index, 'Risk Score'] += row['Annual Clients'] * 0.01  # Adjust coefficient as needed

# Round the Risk Score column to two decimal places
df_dashboard_predicted['Risk Score'] = df_dashboard_predicted['Risk Score'].round(2)

In [123]:
df_dashboard_predicted

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Predicted Risk Category,High %,Low %,Moderate %,Risk Score,Phone Number,Address
0,E0000,179,,Within past year,,,Pass,Moderate,0.000000,0.000000,1.000000,2.90,(847)278-8493x90374,"1212 Tanya River\nLake Allisonview, NT B5R 5A8"
1,E0001,138,,,,,Pass,Low,0.000000,0.979224,0.020776,1.43,204-953-2138,"9133 Young Shoal\nSouth Jamie, AB K3K5V5"
2,E0002,157,,,,,Pass,Low,0.000000,0.979224,0.020776,1.49,(718)897-1945x8676,"2104 Miller Fall\nKellytown, MB P2L7N3"
3,E0003,50,,,,Flagged,Pass,Moderate,0.000000,0.000000,1.000000,2.25,3628619105,"763 Mary Extensions Apt. 737\nKristinmouth, NT..."
4,E0004,150,,,,,Pass,Low,0.000000,0.979224,0.020776,1.47,(210)274-0674x1032,"6776 Alexa Unions\nDavidfurt, NU H3T 5M1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,E4995,677,Major,Within past year,,Flagged,,High,1.000000,0.000000,0.000000,9.77,(890)626-2554x12682,"23786 Kathryn Landing\nNorth Markshire, MB E4A7A5"
4996,E4996,653,,Within past year,Minor,,Pass,High,0.795226,0.000000,0.204774,9.33,8417818271,"5182 Denise Squares Suite 271\nPatrickton, NL ..."
4997,E4997,704,Major,Within past 1-3 years,Minor,Flagged,Pass,High,1.000000,0.000000,0.000000,10.04,9757960695,60964 Stephanie Key Suite 270\nLake Victoriato...
4998,E4998,995,,Within past year,Minor,Flagged,,High,0.977236,0.000000,0.022764,12.93,(488)576-6369x664,"5218 Scott Curve\nShanebury, SK M4E 8A3"


In [97]:
# Count the occurrences of each risk category
risk_category_counts = df_dashboard_predicted['Predicted Risk Category'].value_counts()
print(risk_category_counts)

Predicted Risk Category
Low         2350
Moderate    2349
High         301
Name: count, dtype: int64


#### Add Phone number and Address 

In [98]:
# Function to generate fake phone number
def generate_phone_number():
    fake = Faker()
    return fake.phone_number()

# Function to generate fake Canadian address
def generate_canadian_address():
    fake = Faker('en_CA')
    return fake.address()

In [99]:
# Add new columns with randomly generated values
df_dashboard_predicted['Phone Number'] = [generate_phone_number() for _ in range(len(df_dashboard_predicted))]
df_dashboard_predicted['Address'] = [generate_canadian_address() for _ in range(len(df_dashboard_predicted))]

In [101]:
df_dashboard_predicted=df_dashboard_predicted.drop(columns=['Total Risk Score','Risk Category'])

#### Save output

In [126]:
#Save Output Dataset
# Save to CSV
df_dashboard_predicted.to_csv('../data/df_dashboard_predicted.csv', index=False)