In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('/Users/muhammadalimiran/Desktop/Engineer Task/fraud_oracle.csv')
print(data.head())


  Month  WeekOfMonth  DayOfWeek    Make AccidentArea DayOfWeekClaimed  \
0   Dec            5  Wednesday   Honda        Urban          Tuesday   
1   Jan            3  Wednesday   Honda        Urban           Monday   
2   Oct            5     Friday   Honda        Urban         Thursday   
3   Jun            2   Saturday  Toyota        Rural           Friday   
4   Jan            5     Monday   Honda        Urban          Tuesday   

  MonthClaimed  WeekOfMonthClaimed     Sex MaritalStatus  ...  AgeOfVehicle  \
0          Jan                   1  Female        Single  ...       3 years   
1          Jan                   4    Male        Single  ...       6 years   
2          Nov                   2    Male       Married  ...       7 years   
3          Jul                   1    Male       Married  ...   more than 7   
4          Feb                   2  Female        Single  ...       5 years   

  AgeOfPolicyHolder PoliceReportFiled WitnessPresent AgentType  \
0          26 to 30 

In [3]:
# Separate features and target variable
X = data.drop(columns=['FraudFound_P'])
y = data['FraudFound_P']

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Identify numerical columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Initialize LabelEncoder for categorical columns
label_encoder = LabelEncoder()

# Apply label encoding to categorical columns
X_encoded = X.copy()
X_encoded[categorical_columns] = X[categorical_columns].apply(label_encoder.fit_transform)

# Apply feature scaling to numerical columns
scaler = StandardScaler()
X_encoded[numerical_columns] = scaler.fit_transform(X_encoded[numerical_columns])

In [4]:
print(X_encoded)

       Month  WeekOfMonth  DayOfWeek  Make  AccidentArea  DayOfWeekClaimed  \
0          2     1.717545          6     6             1                 6   
1          4     0.164199          6     6             1                 2   
2         10     1.717545          0     6             1                 5   
3          6    -0.612473          2    17             0                 1   
4          4     1.717545          1     6             1                 6   
...      ...          ...        ...   ...           ...               ...   
15415      9     0.940872          0    17             1                 6   
15416      9     1.717545          4    13             1                 1   
15417      9     1.717545          4    17             0                 1   
15418      2    -1.389146          1    17             1                 5   
15419      2    -0.612473          6    17             1                 5   

       MonthClaimed  WeekOfMonthClaimed  Sex  MaritalStatus  ..

In [9]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [10]:
# Initialize the Random Forest classifier
random_forest_model = RandomForestClassifier()

# Fit the model to the training data
random_forest_model.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = random_forest_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.937094682230869


In [22]:
# Get feature importances
feature_importances = random_forest_model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Get the top 10 important features
top_10_features = feature_importance_df.head(10)

print(top_10_features)

               Feature  Importance
15        PolicyNumber    0.121053
10                 Age    0.077964
16           RepNumber    0.070961
6         MonthClaimed    0.059850
0                Month    0.058744
2            DayOfWeek    0.049306
3                 Make    0.048382
5     DayOfWeekClaimed    0.044286
1          WeekOfMonth    0.040860
7   WeekOfMonthClaimed    0.040829


: 

In [16]:
# Drop all other features from X_encoded except for the top 10
X_encoded_top_10 = X_encoded[top_10_features['Feature']]
X_encoded_top_10.set_index(y, inplace=True)

print(X_encoded_top_10.head())

              PolicyNumber       Age  RepNumber  MonthClaimed  Month  \
FraudFound_P                                                           
0                -1.731938 -1.397554   0.764540             5      2   
0                -1.731714 -0.434015   1.416743             5      4   
0                -1.731489  0.529523  -0.322464            10     10   
0                -1.731265  1.863653  -0.974666             6      6   
0                -1.731040 -0.952844  -1.192067             4      4   

              DayOfWeek  Make  DayOfWeekClaimed  WeekOfMonth  \
FraudFound_P                                                   
0                     6     6                 6     1.717545   
0                     6     6                 2     0.164199   
0                     0     6                 5     1.717545   
0                     2    17                 1    -0.612473   
0                     1     6                 6     1.717545   

              WeekOfMonthClaimed  
FraudFound_

In [18]:
Data_top_10 = data[top_10_features['Feature']]
Data_top_10.set_index(y, inplace=True)

print(Data_top_10.head())

              PolicyNumber  Age  RepNumber MonthClaimed Month  DayOfWeek  \
FraudFound_P                                                               
0                        1   21         12          Jan   Dec  Wednesday   
0                        2   34         15          Jan   Jan  Wednesday   
0                        3   47          7          Nov   Oct     Friday   
0                        4   65          4          Jul   Jun   Saturday   
0                        5   27          3          Feb   Jan     Monday   

                Make DayOfWeekClaimed  WeekOfMonth  WeekOfMonthClaimed  
FraudFound_P                                                            
0              Honda          Tuesday            5                   1  
0              Honda           Monday            3                   4  
0              Honda         Thursday            5                   2  
0             Toyota           Friday            2                   1  
0              Honda         

In [19]:
X_encoded_top_10.to_csv('numeric_top_10_features.csv')
Data_top_10.to_csv('data_top_10_features.csv')