In [20]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [3]:
data = pd.read_csv('/Users/muhammadalimiran/Desktop/Engineer Task/fraud_oracle.csv')
print(data.head())

  Month  WeekOfMonth  DayOfWeek    Make AccidentArea DayOfWeekClaimed  \
0   Dec            5  Wednesday   Honda        Urban          Tuesday   
1   Jan            3  Wednesday   Honda        Urban           Monday   
2   Oct            5     Friday   Honda        Urban         Thursday   
3   Jun            2   Saturday  Toyota        Rural           Friday   
4   Jan            5     Monday   Honda        Urban          Tuesday   

  MonthClaimed  WeekOfMonthClaimed     Sex MaritalStatus  ...  AgeOfVehicle  \
0          Jan                   1  Female        Single  ...       3 years   
1          Jan                   4    Male        Single  ...       6 years   
2          Nov                   2    Male       Married  ...       7 years   
3          Jul                   1    Male       Married  ...   more than 7   
4          Feb                   2  Female        Single  ...       5 years   

  AgeOfPolicyHolder PoliceReportFiled WitnessPresent AgentType  \
0          26 to 30 

In [4]:
# Separate features and target variable
X = data.drop(columns=['FraudFound_P'])
y = data['FraudFound_P']

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Identify numerical columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Initialize LabelEncoder for categorical columns
label_encoder = LabelEncoder()

# Apply label encoding to categorical columns
X_encoded = X.copy()
X_encoded[categorical_columns] = X[categorical_columns].apply(label_encoder.fit_transform)

# Apply feature scaling to numerical columns
scaler = StandardScaler()
X_encoded[numerical_columns] = scaler.fit_transform(X_encoded[numerical_columns])

In [5]:
print(X_encoded)

       Month  WeekOfMonth  DayOfWeek  Make  AccidentArea  DayOfWeekClaimed  \
0          2     1.717545          6     6             1                 6   
1          4     0.164199          6     6             1                 2   
2         10     1.717545          0     6             1                 5   
3          6    -0.612473          2    17             0                 1   
4          4     1.717545          1     6             1                 6   
...      ...          ...        ...   ...           ...               ...   
15415      9     0.940872          0    17             1                 6   
15416      9     1.717545          4    13             1                 1   
15417      9     1.717545          4    17             0                 1   
15418      2    -1.389146          1    17             1                 5   
15419      2    -0.612473          6    17             1                 5   

       MonthClaimed  WeekOfMonthClaimed  Sex  MaritalStatus  ..

In [6]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [7]:
# Initialize the Random Forest classifier
random_forest_model = RandomForestClassifier()

# Fit the model to the training data
random_forest_model.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = random_forest_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.9377431906614786


In [8]:
# Get feature importances
feature_importances = random_forest_model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Get the top 10 important features
top_10_features = feature_importance_df.head(10)

print(top_10_features)

               Feature  Importance
15        PolicyNumber    0.122095
10                 Age    0.077925
16           RepNumber    0.068965
0                Month    0.059482
6         MonthClaimed    0.058698
2            DayOfWeek    0.049252
3                 Make    0.048372
5     DayOfWeekClaimed    0.043875
1          WeekOfMonth    0.043657
7   WeekOfMonthClaimed    0.038926


In [9]:
# Drop all other features from X_encoded except for the top 10
X_encoded_top_10 = X_encoded[top_10_features['Feature']]
X_encoded_top_10.set_index(y, inplace=True)

print(X_encoded_top_10.head())

              PolicyNumber       Age  RepNumber  Month  MonthClaimed  \
FraudFound_P                                                           
0                -1.731938 -1.397554   0.764540      2             5   
0                -1.731714 -0.434015   1.416743      4             5   
0                -1.731489  0.529523  -0.322464     10            10   
0                -1.731265  1.863653  -0.974666      6             6   
0                -1.731040 -0.952844  -1.192067      4             4   

              DayOfWeek  Make  DayOfWeekClaimed  WeekOfMonth  \
FraudFound_P                                                   
0                     6     6                 6     1.717545   
0                     6     6                 2     0.164199   
0                     0     6                 5     1.717545   
0                     2    17                 1    -0.612473   
0                     1     6                 6     1.717545   

              WeekOfMonthClaimed  
FraudFound_

In [10]:
Data_top_10 = data[top_10_features['Feature']]
Data_top_10.set_index(y, inplace=True)

print(Data_top_10.head())

              PolicyNumber  Age  RepNumber Month MonthClaimed  DayOfWeek  \
FraudFound_P                                                               
0                        1   21         12   Dec          Jan  Wednesday   
0                        2   34         15   Jan          Jan  Wednesday   
0                        3   47          7   Oct          Nov     Friday   
0                        4   65          4   Jun          Jul   Saturday   
0                        5   27          3   Jan          Feb     Monday   

                Make DayOfWeekClaimed  WeekOfMonth  WeekOfMonthClaimed  
FraudFound_P                                                            
0              Honda          Tuesday            5                   1  
0              Honda           Monday            3                   4  
0              Honda         Thursday            5                   2  
0             Toyota           Friday            2                   1  
0              Honda         

In [11]:
X_encoded_top_10.to_csv('numeric_top_10_features.csv')
Data_top_10.to_csv('data_top_10_features.csv')

In [12]:
# Initialize the Gradient Boosting classifier
gradient_boosting_model = GradientBoostingClassifier()

# Fit the model to the training data
gradient_boosting_model.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = gradient_boosting_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.9396887159533074


In [13]:
feature_importances = gradient_boosting_model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Get the top 10 important features
top_10_features = feature_importance_df.head(10)

print(top_10_features)

                Feature  Importance
15         PolicyNumber    0.263060
11                Fault    0.214525
31           BasePolicy    0.144923
28  AddressChange_Claim    0.086379
12           PolicyType    0.047570
17           Deductible    0.036344
10                  Age    0.035234
6          MonthClaimed    0.033969
0                 Month    0.027399
3                  Make    0.025459


In [17]:
# Initialize the XGBoost classifier
xgb_model = XGBClassifier()

# Fit the model to the training data
xgb_model.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = xgb_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.9497405966277561


In [18]:
# Get feature importances
feature_importances = xgb_model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Get the top 10 important features
top_10_features = feature_importance_df.head(10)

print(top_10_features)

                Feature  Importance
11                Fault    0.303660
31           BasePolicy    0.149154
0                 Month    0.050021
28  AddressChange_Claim    0.045624
6          MonthClaimed    0.044111
15         PolicyNumber    0.041787
4          AccidentArea    0.026477
12           PolicyType    0.025766
10                  Age    0.022166
17           Deductible    0.021060


In [32]:
# Initialize the CatBoost classifier
catboost_model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.0001, loss_function='Logloss')

# Fit the model to the training data
catboost_model.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = catboost_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)

0:	learn: 0.6929986	total: 7.23ms	remaining: 716ms
1:	learn: 0.6928570	total: 14.4ms	remaining: 706ms
2:	learn: 0.6927326	total: 25.5ms	remaining: 824ms
3:	learn: 0.6925865	total: 29.9ms	remaining: 717ms
4:	learn: 0.6924638	total: 36ms	remaining: 684ms
5:	learn: 0.6923398	total: 42.1ms	remaining: 660ms
6:	learn: 0.6922130	total: 46.5ms	remaining: 618ms
7:	learn: 0.6920895	total: 53ms	remaining: 609ms
8:	learn: 0.6919435	total: 58.9ms	remaining: 595ms
9:	learn: 0.6918157	total: 62.9ms	remaining: 566ms
10:	learn: 0.6916791	total: 66.9ms	remaining: 542ms
11:	learn: 0.6915550	total: 72ms	remaining: 528ms
12:	learn: 0.6914319	total: 76ms	remaining: 509ms
13:	learn: 0.6912462	total: 79.9ms	remaining: 491ms
14:	learn: 0.6911238	total: 85.2ms	remaining: 483ms
15:	learn: 0.6910006	total: 91.3ms	remaining: 479ms
16:	learn: 0.6908673	total: 95.2ms	remaining: 465ms
17:	learn: 0.6907220	total: 99.7ms	remaining: 454ms
18:	learn: 0.6905993	total: 103ms	remaining: 440ms
19:	learn: 0.6904549	total: 107

In [35]:

# Get feature importances
feature_importances = catboost_model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Get the top 10 important features
top_10_features = feature_importance_df.head(10)

print(top_10_features)

             Feature  Importance
31        BasePolicy   41.650763
11             Fault   17.515527
13   VehicleCategory   12.428568
12        PolicyType    6.819316
5   DayOfWeekClaimed    1.644312
14      VehiclePrice    1.568753
18      DriverRating    1.539555
30              Year    1.473808
2          DayOfWeek    1.420121
3               Make    1.379358
