# Grammar check using GPT-2 

#### The grammar score of each review is stored in the 'grammr_score' column of the dataframe shown at the end of this notebook.
#### The grammar score ranges from 0 to 100; a higher score means the language model found the text more fluent.
#### The result dataframe is saved to grammar_result_df.csv.

In [16]:
import pandas as pd

# Widen pandas' display limits so wide frames and long review texts are
# rendered without truncation in the cell outputs.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_colwidth', 200),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)

In [1]:
import torch
import sys
import numpy as np
 
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load the pre-trained GPT-2 weights and switch the model to inference mode
# (eval() disables dropout, so scoring the same text twice gives the same value).
# The load is deliberately NOT wrapped in torch.no_grad(): that context only
# affects operations executed inside it, so wrapping the load does nothing for
# the forward passes that happen later when texts are scored.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
def score(sentence):
    """Return GPT-2's perplexity for ``sentence`` (lower means more fluent).

    The perplexity is ``exp`` of the language-model loss the model reports
    when the token sequence is used both as input and as labels.

    Parameters
    ----------
    sentence : str
        Text to score. Only the first ``model.config.n_positions`` tokens are
        used; anything beyond the model's context window is dropped.

    Returns
    -------
    numpy.float32
        The perplexity, or NaN when the text encodes to fewer than two
        tokens (there is then no next-token prediction to evaluate).
    """
    token_ids = tokenizer.encode(sentence)
    # GPT-2 only has position embeddings for `n_positions` tokens, so truncate
    # instead of letting an over-long review fail inside the model.
    token_ids = token_ids[:model.config.n_positions]
    if len(token_ids) < 2:
        # With zero or one token no next-token target exists, so the loss is
        # undefined. Return NaN explicitly rather than relying on the model.
        return np.float32(np.nan)
    tensor_input = torch.tensor([token_ids])
    # Pure inference: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        loss = model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.numpy())
 


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [140]:
def grammar_score(n, limit):
    """Map a perplexity ``n`` onto a 0-100 grammar score (higher is better).

    The score decreases linearly with ``n`` and reaches 0 at ``n == limit``.

    Parameters
    ----------
    n : number or None
        Perplexity of a text. Falsy values (``None``, ``0``) and NaN are
        treated as "no usable perplexity" and score 0.
    limit : number
        Perplexity at or above which the score is 0.

    Returns
    -------
    int or float
        ``0`` for missing/NaN input or when ``n >= limit``; otherwise
        ``(1 - n / limit) * 100``.
    """
    # None or 0: no perplexity was produced for this text.
    if not n:
        return 0
    # NaN is truthy and every ordering comparison with it is False, so it
    # would otherwise slip past both checks and come back out as NaN.
    # `n != n` is True only for NaN and works for Python and NumPy floats.
    if n != n:
        return 0
    if n >= limit:
        return 0
    return (1 - (n / limit)) * 100
    
    

In [23]:
# Smoke-test the scorer on a single long review before running it on the whole dataset.
score("First thing is that I like this game, it's fantastic and amazing. Though I rate it 4 stars cuz it's right pedal stops working in the game and that's why I kept falling on the bridge.")

52.055805

In [97]:
# Load the raw reviews; the path is relative, so the CSV must sit in the working directory.
data=pd.read_csv('review_data.csv')

In [98]:
# Sanity check: (number of reviews, number of columns).
data.shape

(30000, 4)

In [99]:
# Check how each column was parsed.
data.dtypes

text          object
star           int64
app_id        object
reviewDate    object
dtype: object

In [100]:
# Preview the first 20 raw reviews.
data.head(20)

Unnamed: 0,text,star,app_id,reviewDate
0,Anathi Khanyile,5,com.fingersoft.hillclimb,18/03/21
1,Tony bahut funny hai Hill climbing racing my favourite game,5,com.fingersoft.hillclimb,18/03/21
2,Teturwu,1,com.fingersoft.hillclimb,18/03/21
3,Hoooooooooooyaaaaaaaaa what a game hooooooooooooyaaaaaaaa,5,com.fingersoft.hillclimb,18/03/21
4,This game is nice,5,com.fingersoft.hillclimb,18/03/21
5,Rahulyadavo,5,com.fingersoft.hillclimb,18/03/21
6,"First thing is that I like this game, it's fantastic and amazing. Though I rate it 4 stars cuz it's right pedal stops working in the game and that's why I kept falling on the bridge.",4,com.fingersoft.hillclimb,18/03/21
7,Very taty,3,com.fingersoft.hillclimb,18/03/21
8,good,5,com.fingersoft.hillclimb,18/03/21
9,I LIKE THIS GAME,5,com.fingersoft.hillclimb,18/03/21


#### The text field contains smileys, which are noise for a grammar check.
#### We do not remove any other special characters, because punctuation is necessary.
#### We do not lower-case the text either, because capitalisation is part of grammatical correctness.
#### So we only strip emojis (together with any other non-ASCII characters) from the text field.

In [101]:
def clean(text):
    """Drop every non-ASCII character (emojis, non-English letters) from ``text``.

    Letter case and ASCII punctuation are kept as they are; leading and
    trailing whitespace is trimmed from the result.
    """
    ascii_bytes = text.encode('ascii', 'ignore')
    ascii_only = ascii_bytes.decode('ascii', 'ignore')
    return ascii_only.strip()

In [102]:
# Blank out missing reviews BEFORE casting to str: str(NaN) is the literal
# text 'nan', which would otherwise be scored as if it were a real review
# (and a NaN replacement done after the cast can never match anything).
# Reviews that are empty after cleaning are stored as a single space.
data['text'] = (
    data['text']
    .fillna('')
    .astype(str)
    .map(clean)
    .replace('', ' ')
)

In [103]:
# Preview the first 30 reviews after cleaning.
data.head(30)

Unnamed: 0,text,star,app_id,reviewDate
0,Anathi Khanyile,5,com.fingersoft.hillclimb,18/03/21
1,Tony bahut funny hai Hill climbing racing my favourite game,5,com.fingersoft.hillclimb,18/03/21
2,Teturwu,1,com.fingersoft.hillclimb,18/03/21
3,Hoooooooooooyaaaaaaaaa what a game hooooooooooooyaaaaaaaa,5,com.fingersoft.hillclimb,18/03/21
4,This game is nice,5,com.fingersoft.hillclimb,18/03/21
5,Rahulyadavo,5,com.fingersoft.hillclimb,18/03/21
6,"First thing is that I like this game, it's fantastic and amazing. Though I rate it 4 stars cuz it's right pedal stops working in the game and that's why I kept falling on the bridge.",4,com.fingersoft.hillclimb,18/03/21
7,Very taty,3,com.fingersoft.hillclimb,18/03/21
8,good,5,com.fingersoft.hillclimb,18/03/21
9,I LIKE THIS GAME,5,com.fingersoft.hillclimb,18/03/21


In [104]:
# %%time
# data1['ppl_score']=data1['clean_text'].apply(lambda x:score(x))

In [105]:
# Plain Python list of the cleaned review texts, in dataframe row order.
inputfulllist = data['text'].tolist()

In [106]:
# Number of reviews that will be scored.
len(inputfulllist)

30000

In [107]:
# First batch: list positions 0-9,999.
input10k=inputfulllist[:10000]

In [108]:
%%time
# Score the first batch one review at a time, reporting progress every 2,000 reviews.
scores10k = []
for count, review_text in enumerate(input10k):
    perplexity = score(review_text)
    scores10k.append(perplexity)
    if (count+1) % 2000 == 0:
        print(f"Done with {count+1} elements")

Done with 2000 elements
Done with 4000 elements
Done with 6000 elements
Done with 8000 elements
Done with 10000 elements
Wall time: 26min 33s


In [109]:
# Second batch: list positions 10,000-19,999.
input20k=inputfulllist[10000:20000]

In [110]:
%%time
# Score the second batch one review at a time, reporting progress every 5,000 reviews.
scores20k = []
for count, review_text in enumerate(input20k):
    perplexity = score(review_text)
    scores20k.append(perplexity)
    if (count+1) % 5000 == 0:
        print(f"Done with {count+1} elements")

Done with 5000 elements
Done with 10000 elements
Wall time: 27min 41s


In [111]:
# Third batch: list positions 20,000-24,999.
input25k=inputfulllist[20000:25000]

In [112]:
%%time
# Score the third batch. A review that cannot be scored is recorded as 0 so
# that the result list stays aligned with the input list.
scores25k = []
failed25k = 0
for count, review_text in enumerate(input25k):
    try:
        scores25k.append(score(review_text))
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop this long-running loop.
        scores25k.append(0)
        failed25k += 1
    if (count+1) % 2000 == 0:
        print(f"Done with {count+1} elements")
# Make the number of placeholder scores visible instead of hiding failures.
if failed25k:
    print(f"{failed25k} reviews could not be scored and were recorded as 0")

Done with 2000 elements
Done with 4000 elements
Wall time: 23min 9s


In [113]:
# Fourth batch: list positions 25,000-29,999.
input30k=inputfulllist[25000:30000]

In [114]:
%%time
# Score the fourth batch. A review that cannot be scored is recorded as 0 so
# that the result list stays aligned with the input list.
scores30k = []
failed30k = 0
for count, review_text in enumerate(input30k):
    try:
        scores30k.append(score(review_text))
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop this long-running loop.
        scores30k.append(0)
        failed30k += 1
    if (count+1) % 1000 == 0:
        print(f"Done with {count+1} elements")
# Make the number of placeholder scores visible instead of hiding failures.
if failed30k:
    print(f"{failed30k} reviews could not be scored and were recorded as 0")

Done with 1000 elements
Done with 2000 elements
Done with 3000 elements
Done with 4000 elements
Done with 5000 elements
Wall time: 22min 40s


In [115]:
# Stitch the four batches back together in their original order.
final_scores = [*scores10k, *scores20k, *scores25k, *scores30k]

In [116]:
# Should equal the number of rows in `data`, since the scores are attached as a column next.
len(final_scores)

30000

In [117]:
# Attach the raw perplexities; this relies on `final_scores` being in the same order as the rows of `data`.
data['Scores']=final_scores

In [126]:
# Look at reviews whose perplexity is above 500.
data.loc[data['Scores'] > 500].head(30)

Unnamed: 0,text,star,app_id,reviewDate,Scores
0,Anathi Khanyile,5,com.fingersoft.hillclimb,18/03/21,11924.070312
1,Tony bahut funny hai Hill climbing racing my favourite game,5,com.fingersoft.hillclimb,18/03/21,2675.807373
2,Teturwu,1,com.fingersoft.hillclimb,18/03/21,10662.136719
5,Rahulyadavo,5,com.fingersoft.hillclimb,18/03/21,2475.306152
7,Very taty,3,com.fingersoft.hillclimb,18/03/21,534.715332
10,Racing,5,com.fingersoft.hillclimb,18/03/21,7190.431641
13,Very noob game,1,com.fingersoft.hillclimb,18/03/21,2069.588623
14,Hill climb racing 2 vip game,5,com.fingersoft.hillclimb,18/03/21,6940.280762
15,Good games,4,com.fingersoft.hillclimb,18/03/21,6374.818359
16,I is so oosem game,5,com.fingersoft.hillclimb,18/03/21,1158.699097


In [129]:
# Give the raw perplexity column a more descriptive name.
column_renames = {'Scores': 'model_score'}
data.rename(columns=column_renames, inplace=True)

In [130]:
# Confirm the column rename.
data.head()

Unnamed: 0,text,star,app_id,reviewDate,model_score
0,Anathi Khanyile,5,com.fingersoft.hillclimb,18/03/21,11924.070312
1,Tony bahut funny hai Hill climbing racing my favourite game,5,com.fingersoft.hillclimb,18/03/21,2675.807373
2,Teturwu,1,com.fingersoft.hillclimb,18/03/21,10662.136719
3,Hoooooooooooyaaaaaaaaa what a game hooooooooooooyaaaaaaaa,5,com.fingersoft.hillclimb,18/03/21,53.693508
4,This game is nice,5,com.fingersoft.hillclimb,18/03/21,191.873611


In [143]:
# Perplexities at or above this cut-off receive a grammar score of 0.
PERPLEXITY_LIMIT = 500
data['grammr_score'] = data['model_score'].apply(
    lambda perplexity: grammar_score(perplexity, PERPLEXITY_LIMIT)
)

In [145]:
# Reviews without a usable perplexity (NaN) get the lowest grammar score.
data['grammr_score'] = data['grammr_score'].fillna(0)

In [146]:
# Inspect the first 100 rows with both the raw perplexity and the derived 0-100 score.
data.head(100)

Unnamed: 0,text,star,app_id,reviewDate,model_score,grammr_score
0,Anathi Khanyile,5,com.fingersoft.hillclimb,18/03/21,11924.070312,0.0
1,Tony bahut funny hai Hill climbing racing my favourite game,5,com.fingersoft.hillclimb,18/03/21,2675.807373,0.0
2,Teturwu,1,com.fingersoft.hillclimb,18/03/21,10662.136719,0.0
3,Hoooooooooooyaaaaaaaaa what a game hooooooooooooyaaaaaaaa,5,com.fingersoft.hillclimb,18/03/21,53.693508,89.261298
4,This game is nice,5,com.fingersoft.hillclimb,18/03/21,191.873611,61.625278
5,Rahulyadavo,5,com.fingersoft.hillclimb,18/03/21,2475.306152,0.0
6,"First thing is that I like this game, it's fantastic and amazing. Though I rate it 4 stars cuz it's right pedal stops working in the game and that's why I kept falling on the bridge.",4,com.fingersoft.hillclimb,18/03/21,52.055805,89.588839
7,Very taty,3,com.fingersoft.hillclimb,18/03/21,534.715332,0.0
8,good,5,com.fingersoft.hillclimb,18/03/21,,0.0
9,I LIKE THIS GAME,5,com.fingersoft.hillclimb,18/03/21,295.804474,40.839105


In [128]:
# Spot check on a short, grammatical phrase.
# NOTE(review): in the recorded run this returns ~593, above the 500 cut-off
# used for 'grammr_score', so such a phrase would be scored 0 - the fixed
# cut-off may penalise very short texts; worth confirming.
score('so reliable.')

593.1108

In [147]:
# `result_df` is a second name for `data` (plain assignment, no copy is made).
# Write every column to CSV without the dataframe index.
result_df=data
result_df.to_csv('grammar_result_df.csv',index=False)