In [264]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
import re

### Reading the Data

In [265]:
df = pd.read_csv('Data_netapp.csv')
data = df.copy()
data.Punishment.fillna(0,inplace=True)
data['Punishment'].replace('Only court challan','0',inplace = True)
data['Type'].replace('Traffic and Criminal cases','Traffic and criminal cases',inplace = True)
data = data.reset_index(drop=True)
data.tail(15)

Unnamed: 0,Type,Judgement,Section,Offence,Punishment,Status,Type of punishment
274,Criminal cases,Not guilty,153 RPC 66 IT Act,"intent to cause roit, sending offensive messages",Committed to session court,Unresolved,Committed to session court
275,Traffic cases,Accused,279 IPC,rash driving,Fine 1000,Resolved,Fine
276,Traffic cases,Accused,279 IPC,rash driving,Fine 1000,Resolved,Fine
277,Criminal cases,Accused,269/188 IPC,"Negligently causing danger to life, Disobedien...",Fine 1000,Resolved,Fine
278,Criminal cases,Accused,188/269 IPC,Disobedience to order duly promulgated by publ...,0,Pending,0
279,Criminal cases,Accused,188 IPC 11PCAct,Disobedience to order duly promulgated by publ...,0,Pending,0
280,Traffic cases,Accused,279/IPC 3/181 179 MVAct,"Rash driving, without driving license,disobeyi...",0,Pending,0
281,Traffic cases,Accused,279 IPC 3/181 MVAct,"Rash Driving,without driving license",0,Pending,0
282,Criminal cases,Accused,188 IPC 11PCAct 3/181MVAct,Disobedience to order duly promulgated by pub...,0,Pending,0
283,Traffic cases,Accused,279 IPC,rash driving,0,Pending,0


In [266]:
Y = data[(data['Punishment']!=0)&(data['Type of punishment']!='0')]['Punishment']
X = data[(data['Punishment']!=0)&(data['Type of punishment']!='0')]['Offence']
Z = data[(data['Punishment']!=0)&(data['Type of punishment']!='0')]['Type of punishment']

In [268]:
X_train, X_test, Y_train, Y_test,Z_train, Z_test = train_test_split(X, Y,Z, random_state = 1)

### Replace "n't" with not

In [None]:
def convert_to_not(text):
    text = re.sub('n\'t', 'not', text)
    return text

In [None]:
data['Offence'] = data.Offence.apply(convert_to_not)
df.head(10)

### Removing stopwords and unnecessary data

In [269]:
stop_words = set(stopwords.words('english'))
stop_words.update(list(punctuation))
words = ['those', 'on', 'U','own', '’ve', 'yourselves', 'around', 'between', 'four', 'been', 'alone', 'off', 'am', 'then', 'other', 'can', 'regarding', 'hereafter', 'front', 'too', 'used', 'wherein', '‘ll', 'doing', 'everything', 'up', 'onto', 'never', 'either', 'how', 'before', 'anyway', 'since', 'through', 'amount', 'now', 'he', 'was', 'have', 'into', 'because', 'not', 'therefore', 'they', 'n’t', 'even', 'whom', 'it', 'see', 'somewhere', 'thereupon', 'nothing', 'whereas', 'much', 'whenever', 'seem', 'until', 'whereby', 'at', 'also', 'some', 'last', 'than', 'get', 'already', 'our', 'once', 'will', 'noone', "'m", 'that', 'what', 'thus', 'no', 'myself', 'out', 'next', 'whatever', 'although', 'though', 'which', 'would', 'therein', 'nor', 'somehow', 'whereupon', 'besides', 'whoever', 'ourselves', 'few', 'did', 'without', 'third', 'anything', 'twelve', 'against', 'while', 'twenty', 'if', 'however', 'herself', 'when', 'may', 'ours', 'six', 'done', 'seems', 'else', 'call', 'perhaps', 'had', 'nevertheless', 'where', 'otherwise', 'still', 'within', 'its', 'for', 'together', 'elsewhere', 'throughout', 'of', 'others', 'show', '’s', 'anywhere', 'anyhow', 'as', 'are', 'the', 'hence', 'something', 'hereby', 'nowhere', 'latterly', 'say', 'does', 'neither', 'his', 'go', 'forty', 'put', 'their', 'by', 'namely', 'could', 'five', 'unless', 'itself', 'is', 'nine', 'whereafter', 'down', 'bottom', 'thereby', 'such', 'both', 'she', 'become', 'whole', 'who', 'yourself', 'every', 'thru', 'except', 'very', 'several', 'among', 'being', 'be', 'mine', 'further', 'n‘t', 'here', 'during', 'why', 'with', 'just', "'s", 'becomes', '’ll', 'about', 'a', 'using', 'seeming', "'d", "'ll", "'re", 'due', 'wherever', 'beforehand', 'fifty', 'becoming', 'might', 'amongst', 'my', 'empty', 'thence', 'thereafter', 'almost', 'least', 'someone', 'often', 'from', 'keep', 'him', 'or', '‘m', 'top', 'her', 'nobody', 'sometime', 'across', '‘s', '’re', 'hundred', 'only', 'via', 'name', 'eight', 'three', 'back', 'to', 'all', 'became', 'move', 'me', 'we', 'formerly', 'so', 'i', 'whence', 'under', 'always', 'himself', 'in', 'herein', 'more', 'after', 'themselves', 'you', 'above', 'sixty', 'them', 'your', 'made', 'indeed', 'most', 'everywhere', 'fifteen', 'but', 'must', 'along', 'beside', 'hers', 'side', 'former', 'anyone', 'full', 'has', 'yours', 'whose', 'behind', 'please', 'ten', 'seemed', 'sometimes', 'should', 'over', 'take', 'each', 'same', 'rather', 'really', 'latter', 'and', 'ca', 'hereupon', 'part', 'per', 'eleven', 'ever', '‘re', 'enough', "n't", 'again', '‘d', 'us', 'yet', 'moreover', 'mostly', 'one', 'meanwhile', 'whither', 'there', 'toward', '’m', "'ve", '’d', 'give', 'do', 'an', 'quite', 'these', 'everyone', 'towards', 'this', 'cannot', 'afterwards', 'beyond', 'make', 'were', 'whether', 'well', 'another', 'below', 'first', 'upon', 'any', 'none', 'many', 'serious', 'various', 're', 'two', 'less', '‘ve']
stop_words.update(words)

In [270]:
# cleaning test data
for i in range(len(X_test)):
  idx = X_test.index[i]
  str = ""
  for j in X_test[idx].split(' '):
    if len(j.strip()) > 2 and j.strip()[:2] == 'JK':
      continue
    elif j.strip().lower() in stop_words:
      continue
    elif len(j.strip()) >= 1 and j.strip()[-1] == ',':
      str += j.strip()[:-1]
      str += " "
    else:
      str += j.strip()
      str += " "
  X_test[idx] = str
  print(X_test[idx])

Overtaking 184 motor vehicle act 
Unattended stationary vehicle engine running pubic place 
helmet U/S 129/194 
person age 18 years driving motor vehicle  
rash driving hurts rashly 
keeping vehicle stationary public place precautions  
Smoking public service vehicle  
D/L U/S 3/181 
uniform 
Overtaking 
Commerical vehicle plying pollution U/S 190(2) 
uniform 
Mobile Phone driving 
uniform 
Rash driving endanger life driving license 
Dangerous driving 
Overspeeding 
Dangerous driving 
Stunt driving wheeler  
keeping vehicle stationary precautions  
helmet U/S 129/194 
rash driving 
Overspeeding 
helmet U/S 129/194 
intent cause miscarriage raping pregnant woman 
rash driving hurts rashly 
seat belt 194B motor vehicle act 
Registration Number plate according pattern 
Riding motorcycle helment 129/194 
seat belt 194B motor vehicle act 
Disobedience order duly promulgated public servant cruelty animals 
uniform 
helmet U/S 129/194 
Disobedience order duly promulgated public servant cruelt

### Stemming and lemmatisation

In [271]:
import re
global str
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    print(text)
    text = re.sub(r'[^\w\s]', '', (text).lower().strip())
            
    ## Tokenize
    lst_text = text.split()  
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Stemming
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [272]:
X_train = X_train.apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True,lst_stopwords=stop_words))
X_train

rash driving
Registration Number plate not according to the pattern
attempt to murder
Without valid Insurance U/S 146/196
Stunt driving by a two wheeler U/S 184
Without insurance U/S 146/196
Without helmet U/S 129/194 D
Disobedience to order duly promulgated by public servant, Negligently causing danger to life, triple driving
Disobedience to order duly promulgated by public servant, cruelty with animals,dangerous driving, without driving license, disobeying authority 
Using vehicle in unsafe condition U/S 190(1) 
Without uniform
Driving without D/L U/S 3/181
Gaming and setting animals to fight
Without seat belt 194B motor vehicle act
Overspeeding
Without D/L U/S 3/181 , without seat belt 190
Disobeying signal of Police officer regulating traffic U/S 179
The scorpio, JK1420030039541 vehicle fitted with tinted glass. Hence seized w/s 206 Motor vehicle Act & sent to Police station Udhampur for safe custody.
Overspeeding
Without helmet U/S 129/194 D
wrongful restraint, voluntarily causing

275                                         rash driving
99           registration number plate according pattern
250                                       attempt murder
173                               valid insurance 146196
31                             stunt driving wheeler 184
                             ...                        
203                     seat belt 194b motor vehicle act
255    disobedience order duly promulgated public ser...
72           owner allowing juvenile drive motor vehicle
235    wrongful restraint voluntarily causing hurt ro...
37     overloading good vehicle 1133 1941 vehicle uns...
Name: Offence, Length: 208, dtype: object

In [273]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

Tf- Idf Vectorizer and SVC classifier

In [274]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
pipe = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression(solver='liblinear'))])
pipe.fit(X_train,Z_train)
pipe.score(X_train,Z_train)
pipe.score(X_test,Z_test)

0.9857142857142858

In [275]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

In [276]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
pipe = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression(solver='liblinear'))])

In [277]:
pipe1= Pipeline(steps=[('cv',CountVectorizer()),('rf',RandomForestClassifier())])
pipe1.fit(X_train,Z_train)
pipe1.score(X_train,Z_train)

0.9903846153846154

In [278]:
pipe1.score(X_test,Z_test)

0.9857142857142858

In [279]:
pipe2= Pipeline(steps=[('cv',CountVectorizer()),('rf',KNeighborsClassifier(n_neighbors=3))])
pipe2.fit(X_train,Z_train)
pipe2.score(X_train,Z_train)

0.9855769230769231

In [280]:
pipe2.score(X_test,Z_test)
Z_test

11           Fine
73           Fine
146          Fine
79           Fine
266          Fine
          ...    
138          Fine
189          Fine
222    Compromise
225          Fine
263          Fine
Name: Type of punishment, Length: 70, dtype: object

In [281]:
X_unresolved= data[data['Status']=='Pending']['Offence']
Z_unresolved = pipe1.predict(X_unresolved)
start = data[data['Status']=='Pending']['Offence'].index[0]
end = data[data['Status']=='Pending']['Offence'].index[-1]
data.loc[start:end]['Type of punishment'] = Z_unresolved

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[start:end]['Type of punishment'] = Z_unresolved


In [282]:
pipe.fit(X_train,Y_train)
pipe.score(X_train,Y_train)
pipe.score(X_test,Y_test)

0.7571428571428571

In [283]:
pipe1.fit(X_train,Y_train)
pipe1.score(X_train,Y_train)
pipe1.score(X_test,Y_test)

0.8

In [284]:
pipe2.fit(X_train,Y_train)
pipe2.score(X_train,Y_train)
pipe2.score(X_test,Y_test)

0.6857142857142857

In [285]:
Y_unresolved = pipe1.predict(X_unresolved)
Y_unresolved

array(['Fine 1000', 'Fine 250', 'Fine 6000', 'Fine 1000', 'Fine 5250',
       'Fine 1000', 'Fine 1000', 'Committed to session court ',
       'Fine 1000', 'Fine 1000', 'Fine 1000'], dtype=object)

In [286]:
data.loc[start:end]['Punishment'] = Y_unresolved
data.to_csv("Data_netapp2_update.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[start:end]['Punishment'] = Y_unresolved


In [287]:
import pickle
filename = 'trained_model_new.sav'
pickle.dump(pipe1,open(filename,'wb'))

In [288]:
loaded_model = pickle.load(open('trained_model_new.sav','rb'))

In [289]:
input_data = ['Without uniform','Over speeding']

In [290]:
# input_data_as_numpy_array = np.array(input_data)
# input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = loaded_model.predict(input_data)
print(prediction)

['Fine 500' 'Fine 500']
