## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
data=pd.read_csv("Domestic violence.csv")

In [3]:
data.head(2)

Unnamed: 0,SL. No,Age,Education,Employment,Income,Marital status,Violence
0,1,30,secondary,unemployed,0,married,yes
1,2,47,tertiary,unemployed,0,married,no


## Taking care of duplicate observations over here


In [4]:
data.duplicated().sum()

0

## Taking care of missing values if present over here


In [5]:
data.isnull().sum()

SL. No             0
Age                0
Education          0
Employment         0
Income             0
Marital status     0
Violence           0
dtype: int64

## Filtering all the numerical features over here


In [6]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

SL. No
Age
Income


In [7]:
data[numerical_features]

Unnamed: 0,SL. No,Age,Income
0,1,30,0
1,2,47,0
2,3,24,0
3,4,22,0
4,5,50,0
...,...,...,...
342,343,27,30000
343,344,26,35000
344,345,27,0
345,346,29,0


## Filtering all the categorical features over here


In [8]:
cat_features=[feature for feature in data.columns if data[feature].dtype=='O']
for feature in cat_features:
  print(feature)

Education 
Employment 
Marital status 
Violence 


In [9]:
data[cat_features]

Unnamed: 0,Education,Employment,Marital status,Violence
0,secondary,unemployed,married,yes
1,tertiary,unemployed,married,no
2,tertiary,unemployed,unmarred,no
3,tertiary,unemployed,unmarred,no
4,primary,unemployed,married,yes
...,...,...,...,...
342,tertiary,semi employed,unmarred,no
343,tertiary,employed,married,no
344,primary,unemployed,married,yes
345,primary,unemployed,married,no


## Encoding the categorical features into numerical features over here

In [10]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [11]:
data

Unnamed: 0,SL. No,Age,Education,Employment,Income,Marital status,Violence
0,1,30,0,0,0,0,0
1,2,47,1,0,0,0,1
2,3,24,1,0,0,1,1
3,4,22,1,0,0,1,1
4,5,50,2,0,0,0,0
...,...,...,...,...,...,...,...
342,343,27,1,1,30000,1,1
343,344,26,1,2,35000,0,1
344,345,27,2,0,0,0,0
345,346,29,2,0,0,0,1


## Creating the features and labels over here

In [12]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values


## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the Classification model on the training dataset over here

In [18]:
# from sklearn.ensemble import RandomForestClassifier
# classifier=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
# classifier.fit(X_train,y_train)


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split



# List of classifiers to evaluate
classifiers = {
    "RandomForest": RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
    "GradientBoosting": GradientBoostingClassifier(random_state=0),
    "LogisticRegression": LogisticRegression(random_state=0),
    "SVC": SVC(random_state=0)
}

# Dictionary to store results
results = {}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    results[name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Print results
for name, metrics in results.items():
    print(f"Classifier: {name}")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}\n")

# Select the best classifier based on a chosen metric (e.g., F1 score)
best_classifier = max(results, key=lambda k: results[k]['f1'])
print(f"Best classifier based on F1 score: {best_classifier}")

Classifier: RandomForest
  Accuracy: 0.6429
  Precision: 0.5476
  Recall: 0.5190
  F1 Score: 0.4821

Classifier: GradientBoosting
  Accuracy: 0.6857
  Precision: 0.6500
  Recall: 0.5815
  F1 Score: 0.5727

Classifier: LogisticRegression
  Accuracy: 0.6571
  Precision: 0.3286
  Recall: 0.5000
  F1 Score: 0.3966

Classifier: SVC
  Accuracy: 0.6571
  Precision: 0.3286
  Recall: 0.5000
  F1 Score: 0.3966

Best classifier based on F1 score: GradientBoosting


## Evaluating the performance of the model on the testing dataset over here

In [22]:
data['Violence '].value_counts()

Violence 
1    261
0     86
Name: count, dtype: int64

In [52]:
class_weight={0:500,1:10}

In [53]:
classifier=RandomForestClassifier(random_state=0,class_weight=class_weight).fit(X_train,y_train)

In [54]:
y_pred=classifier.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [1 0]]


## Finding the metrics over here


In [55]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[ 6 18]
 [ 2 44]]
              precision    recall  f1-score   support

           0       0.75      0.25      0.38        24
           1       0.71      0.96      0.81        46

    accuracy                           0.71        70
   macro avg       0.73      0.60      0.59        70
weighted avg       0.72      0.71      0.66        70

0.7142857142857143
