# RR Gensim Tfidf

In [1]:
import os
import re

In [2]:
import string

In [3]:
import pandas as pd
import numpy as np

In [4]:
import spacy
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words

## Data

In [5]:
DIR = r"C:\Users\jokkl\Documents\Masters\S3\Projet en TAL\Data\CSV DB"
train = os.listdir(DIR)[3]
train

'20221010 195911 RR_Benchmark_TRAIN.csv'

In [6]:
df = pd.read_csv(DIR + os.sep + train, sep=";", encoding="UTF-8")

In [7]:
# Cleaning
def process_token(text):
    text = text.lower()
    for char in string.punctuation:
        text = text.replace(char, '')
    # text = text.replace(re.escape(string.punctuation), '')
    # text = re.sub(re.escape(string.punctuation), '', text)
    tokens = text.split(' ')
    for token in tokens:
        token = re.sub(re.escape(string.punctuation), '', token)
        if len(token) < 4 or not(token.isalpha()):
            tokens.remove(token)
    clean = ' '.join(token for token in tokens)
    return clean

In [8]:
df['cleaned'] = df['text'].apply(process_token)

In [9]:
df.isna().sum()

doc_id       0
result_id    0
text         0
RR           0
cleaned      0
dtype: int64

In [10]:
RR_types = df['RR'].unique()
RR_types

array(['PREAMBLE', 'NONE', 'FAC', 'ARG_RESPONDENT', 'RLC',
       'ARG_PETITIONER', 'ANALYSIS', 'PRE_RELIED', 'RATIO', 'RPC',
       'ISSUE', 'STA', 'PRE_NOT_RELIED'], dtype=object)

## N-grams

In [11]:
def get_n_grams(text, ngram=1):
    words = [word for word in text.split(' ') if word.isalpha() and word not in set(stopwords) and word!='']
    # print('Sentence:', words)
    temp = zip(*[words[i:] for i in range(0, ngram)])
    ngrams = [' '.join(ngram) for ngram in temp]
    return ngrams

### Unigram

In [76]:
ug = df

In [77]:
ug['ngrams'] = ug['cleaned'].apply(get_n_grams, 1)

In [78]:
ug.head(5)

Unnamed: 0,doc_id,result_id,text,RR,cleaned,ngrams,test,PREDICTION
0,1735,d7a902fe9c23417499a7ef782f9fbdeb,"IN THE HIGH COURT OF KARNATAKA, ...",PREAMBLE,high court karnataka circuit bench gulb...,"[high, court, karnataka, circuit, bench, gulba...","[PRE_NOT_RELIED, PREAMBLE, NONE, FAC, RLC, ANA...",PRE_NOT_RELIED
1,1735,8d41599e98424d9480c25109556a7d14,BEFORETHE HON'BLE MR.JUSTICE ANAND BYRAR...,PREAMBLE,beforethe honble mrjustice anand byrareddy cri...,"[beforethe, honble, mrjustice, anand, byraredd...","[PREAMBLE, NONE, FAC, STA, ARG_RESPONDENT, RLC...",PREAMBLE
2,1735,e501424117da40a7935c2d9f2fb2fe38,This Criminal Appeal is filed under Section 37...,PREAMBLE,this criminal appeal filed under section theco...,"[criminal, appeal, filed, section, thecode, cr...","[PREAMBLE, RPC, NONE, FAC, RLC, ARG_RESPONDENT...",PREAMBLE
3,1735,4825806388fe43d39f73354b10b5b32d,This appeal coming on for hearing this ...,PREAMBLE,this appeal coming for hearing this courtdeliv...,"[appeal, coming, hearing, courtdelivered, foll...","[PREAMBLE, NONE, RLC, RPC, FAC, ARG_PETITIONER...",PREAMBLE
4,1735,d6893a25f82948f8be17fc9e876fb716,Heard the learned Counsel for the appel...,NONE,heard learned counsel the appellant the lea...,"[heard, learned, counsel, appellant, learned, ...","[NONE, ARG_RESPONDENT, ARG_PETITIONER, RLC, RA...",NONE


In [14]:
from collections import defaultdict

In [15]:
ngram_counts = {x:defaultdict(lambda: 0) for x in RR_types}

In [16]:
for label, ddict in ngram_counts.items():
    print('Processing:', label)
    ngram_list = df[df['RR']==label].ngrams
    for sublist in ngram_list:
        for item in sublist:
            ddict[item] += 1

Processing: PREAMBLE
Processing: NONE
Processing: FAC
Processing: ARG_RESPONDENT
Processing: RLC
Processing: ARG_PETITIONER
Processing: ANALYSIS
Processing: PRE_RELIED
Processing: RATIO
Processing: RPC
Processing: ISSUE
Processing: STA
Processing: PRE_NOT_RELIED


In [17]:
ngrams = pd.DataFrame(ngram_counts)
ngrams.replace(np.nan, 0)
ngrams = ngrams.rename_axis('word').reset_index()

In [18]:
ngrams.head(5)

Unnamed: 0,word,PREAMBLE,NONE,FAC,ARG_RESPONDENT,RLC,ARG_PETITIONER,ANALYSIS,PRE_RELIED,RATIO,RPC,ISSUE,STA,PRE_NOT_RELIED
0,high,261.0,75.0,242.0,26.0,159.0,41.0,500.0,159.0,53.0,55.0,26.0,37.0,28.0
1,court,524.0,261.0,673.0,141.0,315.0,275.0,1720.0,627.0,202.0,242.0,78.0,177.0,74.0
2,karnataka,91.0,2.0,26.0,5.0,4.0,14.0,31.0,12.0,1.0,5.0,2.0,2.0,2.0
3,circuit,6.0,,,,,,1.0,,,,,,
4,bench,46.0,1.0,24.0,5.0,26.0,15.0,65.0,42.0,3.0,3.0,2.0,,19.0


In [38]:
top_1grams = {x:defaultdict(lambda: 0) for x in RR_types}

In [39]:
for col in ngrams.columns[1:]:
    ng_col = ngrams[['word', col]]
    desc = ng_col.sort_values(by=[col], ascending=False)
    desc = desc.dropna()
    descriptors = desc['word'].tolist()[:20]
    top_1grams[col] = {y:0 for y in descriptors}

In [40]:
top_1grams

{'PREAMBLE': {'court': 0,
  'high': 0,
  'appeal': 0,
  'years': 0,
  'state': 0,
  'appellant': 0,
  'criminal': 0,
  'judgment': 0,
  'case': 0,
  'section': 0,
  'respondent': 0,
  'commissioner': 0,
  'order': 0,
  'incometax': 0,
  'income': 0,
  'date': 0,
  'dated': 0,
  'accused': 0,
  'aged': 0,
  'advocate': 0},
 'NONE': {'court': 0,
  'signature': 0,
  'judgment': 0,
  'list': 0,
  'judge': 0,
  'appeal': 0,
  'order': 0,
  'learned': 0,
  'dated': 0,
  'civil': 0,
  'criminal': 0,
  'respondent': 0,
  'high': 0,
  'counsel': 0,
  'heard': 0,
  'appellate': 0,
  'jurisdiction': 0,
  'delivered': 0,
  'prosecution': 0,
  'appellant': 0},
 'FAC': {'accused': 0,
  'court': 0,
  'section': 0,
  'order': 0,
  'appellant': 0,
  'case': 0,
  'said': 0,
  'filed': 0,
  'assessee': 0,
  'appeal': 0,
  'dated': 0,
  'police': 0,
  'respondent': 0,
  'deceased': 0,
  'house': 0,
  'income': 0,
  'petitioner': 0,
  'complainant': 0,
  'high': 0,
  'stated': 0},
 'ARG_RESPONDENT': {'lear

In [41]:
import copy

In [63]:
def evaluate_class(text):
    ngram_list = copy.deepcopy(top_1grams)
    for label, ngrams in ngram_list.items():
        for word in text:
            if word in ngrams.keys():
                ngrams[word] += 1
    count_raw = defaultdict()
    for label, ngrams in ngram_list.items():
        count_raw[label] = sum(1 for item in ngrams.values() if item!=0)
    total = sum(count_raw.values())
    count = {k:round(v/total, 2) for k,v in count_raw.items() if total!=0}
    count = sorted(count, key=count.get, reverse=True)
    return count

In [67]:
def get_prediction(list_obj):
    if list_obj:
        label = list_obj[0]
    else:
        label = 'NONE'
    return label

In [64]:
df['test'] = df['ngrams'].apply(evaluate_class)

In [68]:
df['PREDICTION'] = df['test'].apply(get_prediction)

In [69]:
pd.reset_option('max_columns')
pd.reset_option('max_rows')
df.head(10)

Unnamed: 0,doc_id,result_id,text,RR,cleaned,ngrams,test,PREDICTION
0,1735,d7a902fe9c23417499a7ef782f9fbdeb,"IN THE HIGH COURT OF KARNATAKA, ...",PREAMBLE,high court karnataka circuit bench gulb...,"[high, court, karnataka, circuit, bench, gulba...","[PRE_NOT_RELIED, PREAMBLE, NONE, FAC, RLC, ANA...",PRE_NOT_RELIED
1,1735,8d41599e98424d9480c25109556a7d14,BEFORETHE HON'BLE MR.JUSTICE ANAND BYRAR...,PREAMBLE,beforethe honble mrjustice anand byrareddy cri...,"[beforethe, honble, mrjustice, anand, byraredd...","[PREAMBLE, NONE, FAC, STA, ARG_RESPONDENT, RLC...",PREAMBLE
2,1735,e501424117da40a7935c2d9f2fb2fe38,This Criminal Appeal is filed under Section 37...,PREAMBLE,this criminal appeal filed under section theco...,"[criminal, appeal, filed, section, thecode, cr...","[PREAMBLE, RPC, NONE, FAC, RLC, ARG_RESPONDENT...",PREAMBLE
3,1735,4825806388fe43d39f73354b10b5b32d,This appeal coming on for hearing this ...,PREAMBLE,this appeal coming for hearing this courtdeliv...,"[appeal, coming, hearing, courtdelivered, foll...","[PREAMBLE, NONE, RLC, RPC, FAC, ARG_PETITIONER...",PREAMBLE
4,1735,d6893a25f82948f8be17fc9e876fb716,Heard the learned Counsel for the appel...,NONE,heard learned counsel the appellant the lea...,"[heard, learned, counsel, appellant, learned, ...","[NONE, ARG_RESPONDENT, ARG_PETITIONER, RLC, RA...",NONE
5,1735,5ffc5502c95d45d7aae96937f128fdcb,2. The accused is in appeal in the following c...,FAC,accused appeal in following circumstances appe...,"[accused, appeal, following, circumstances, ap...","[RLC, PREAMBLE, FAC, RATIO, RPC, ISSUE, NONE, ...",RLC
6,1735,afca23e571ad475d89a6fa226ca771bd,The complainant Sonabai was a resident of Hanu...,FAC,complainant sonabai a resident hanumantwadi ba...,"[complainant, sonabai, resident, hanumantwadi,...","[FAC, PREAMBLE, NONE, ARG_RESPONDENT, RLC, ARG...",FAC
7,1735,7781f76712ed452dab7119e58f514065,"She had two daughters, Laxmibai and Sangeeta.",FAC,had daughters laxmibai sangeeta,"[daughters, laxmibai, sangeeta]",[],NONE
8,1735,13ad25d88e1a4cf7b1a0f8e49561acb5,She had lost her husband about 12 years prior ...,FAC,had lost husband about years prior the complaint,"[lost, husband, years, prior, complaint]","[PREAMBLE, NONE, FAC, ARG_RESPONDENT, RLC, ARG...",PREAMBLE
9,1735,bdd429d80cb149b280248e4e26d3f86d,She had performed the marriage of her elder da...,FAC,had performed marriage her elder daughter laxm...,"[performed, marriage, elder, daughter, laxmi, ...","[PREAMBLE, NONE, FAC, ARG_RESPONDENT, RLC, ARG...",PREAMBLE


## Evaluation

In [70]:
from sklearn import metrics

In [71]:
classification_report = metrics.classification_report(df['RR'], df['PREDICTION'], zero_division=1, digits=3)
print(classification_report)

                precision    recall  f1-score   support

      ANALYSIS      0.537     0.105     0.176     10695
ARG_PETITIONER      0.175     0.056     0.085      1315
ARG_RESPONDENT      0.089     0.309     0.138       698
           FAC      0.370     0.254     0.301      5744
         ISSUE      0.165     0.332     0.220       367
          NONE      0.126     0.728     0.215      1423
      PREAMBLE      0.246     0.314     0.276      4167
PRE_NOT_RELIED      0.077     0.184     0.108       158
    PRE_RELIED      0.264     0.171     0.208      1431
         RATIO      0.100     0.184     0.130       674
           RLC      0.146     0.291     0.194       752
           RPC      0.452     0.355     0.398      1081
           STA      0.191     0.372     0.253       481

      accuracy                          0.225     28986
     macro avg      0.226     0.281     0.208     28986
  weighted avg      0.365     0.225     0.223     28986



### Bigram