### NEWS CRIME CLASSIFICATION

The aim of this project is to classify whether a news article is crime-related or not, based on its title. The dataset was obtained from Kaggle. This is an NLP task, tackled with traditional techniques: text preprocessing and TF-IDF feature extraction, followed by training Naive Bayes, logistic regression and random forest classifiers.

In [10]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
#load the crime / non-crime article titles from the CSV file
DATA_FILE = 'CrimeVsNoCrimeArticles.csv'
df = pd.read_csv(DATA_FILE)

#preview the first five rows
df.head()

Unnamed: 0,title,is_crime_report
0,What's New and Cool in the Fitness Sphere?,0
1,"'Today I Die,' Says Gang Leader Who Killed Sel...",1
2,Zero Jail Time For Cop Who Assaulted Disabled ...,1
3,Three Easy Gratitude Lessons,0
4,Can Change At UVA Make Campuses A Safer Place?,0


In [3]:
#Check dataset information: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7124 entries, 0 to 7123
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            7123 non-null   object
 1   is_crime_report  7124 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB


In [4]:
#count the missing values in each column of the raw frame
df.isna().sum()

title              1
is_crime_report    0
dtype: int64

In [5]:
#replace missing values with an empty string; the result is a new frame, df is left untouched
df_cl = df.fillna('')

#preview the cleaned frame
df_cl.head()

Unnamed: 0,title,is_crime_report
0,What's New and Cool in the Fitness Sphere?,0
1,"'Today I Die,' Says Gang Leader Who Killed Sel...",1
2,Zero Jail Time For Cop Who Assaulted Disabled ...,1
3,Three Easy Gratitude Lessons,0
4,Can Change At UVA Make Campuses A Safer Place?,0


In [6]:
#confirm that no missing values remain in the cleaned frame
df_cl.isna().sum()

title              0
is_crime_report    0
dtype: int64

In [7]:
#separate into X and y: the feature (article title) and the target (crime label)
X, y = df_cl['title'], df_cl['is_crime_report']

#hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)


In [8]:
#Feature extraction: TF-IDF over lowercased titles with English stop words removed
fe = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

#fit the vectorizer on the training titles only, then apply the same fitted vectorizer to the test titles
X_train_features = fe.fit_transform(X_train)
X_test_features = fe.transform(X_test)

In [12]:
#train the models

#candidate classifiers, keyed by the display name used in the results table
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Naives Bayes': MultinomialNB()
}

#store the results: one entry per model in each list, appended in training order
results = {
    'Models': [],
    'Train_Accuracy': [],
    'Test_Accuracy': []
}

for name, model in models.items():
    #single-step pipeline: the TF-IDF features are computed before this loop,
    #so only the classifier is wrapped here
    pipeline = Pipeline([
        ('Classifier', model)
    ])

    #train
    pipeline.fit(X_train_features, y_train)

    #predict on both splits so train and test accuracy can be compared
    train_pred = pipeline.predict(X_train_features)
    test_pred = pipeline.predict(X_test_features)

    #results: scikit-learn metrics take (y_true, y_pred) in that order; accuracy
    #is symmetric, but keeping the documented order avoids silent errors if an
    #asymmetric metric (precision, recall, ...) is swapped in later
    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)

    #append results
    results['Models'].append(name)
    results['Train_Accuracy'].append(train_accuracy)
    results['Test_Accuracy'].append(test_accuracy)


In [13]:
#tabulate the per-model train and test accuracy collected in results
df_results = pd.DataFrame.from_dict(results)
df_results

Unnamed: 0,Models,Train_Accuracy,Test_Accuracy
0,Logistic Regression,0.952467,0.894294
1,Random Forest,1.0,0.890552
2,Naives Bayes,0.962495,0.879794
