In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('merged_dataset.csv')

In [3]:
df = df.rename(columns = {'class': 'class_name'})
df = df.rename(columns=lambda x: x.strip() if isinstance(x, str) else x)
df = df.drop(columns = ['class'], axis = 1)

In [4]:
df.sample(5)

Unnamed: 0.1,index,sentence,subject,class_name,Unnamed: 0
29968,1464,"First, MNCs can provide money for additional i...",sst,10.0,
7258,772,Make sure that the peel is perfectly flat on t...,science,9.0,
13048,1698,"For long, villagers had been suffering from i...",sst,9.0,
26684,873,The above discussion indicate that both RNA an...,bio,12.0,
22495,175,"As of now, the last theory seems the most plau...",history,12.0,


In [5]:
df.drop(columns=['Unnamed: 0'],inplace = True)

In [7]:
df['class_name'].unique()

array([nan,  7.,  8.,  6.,  9., 10., 11., 12.])

In [8]:
df = df.dropna()

In [9]:
df['class_name'].unique()

array([ 7.,  8.,  6.,  9., 10., 11., 12.])

## size of the dataset

In [10]:
for class_num in range(6, 13):
    df_class = df[df['class_name'] == class_num]
    print(class_num, df_class.shape)

6 (1040, 4)
7 (2537, 4)
8 (2252, 4)
9 (6575, 4)
10 (3380, 4)
11 (7489, 4)
12 (6184, 4)


In [11]:
df.shape

(29457, 4)

In [12]:
df['subject'].unique()

array(['science', 'eng', 'sst', 'HS', 'history', 'bio'], dtype=object)

In [13]:
df['subject'] = df['subject'].replace('sst', 'social_science')
df['subject'] = df['subject'].replace('eng', 'english')
df['subject'] = df['subject'].replace('HS', 'Home_science')
df['subject'] = df['subject'].replace('bio', 'biology')

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import r2_score,mean_squared_error
from textstat import automated_readability_index
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import TfidfVectorizer


## <font color='red'>TRY</font>

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize
import string

def preprocessing_text(text):
    text = str(text)
    text = text.replace('“', '').replace('”', '').replace('–','').replace(',','').replace('’','')
    sentences = sent_tokenize(text)
    processed_sentences = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        unwanted_symbols = string.punctuation.replace('.', '').replace("'", '') 
        tokens = [token.lower() for token in tokens if token not in unwanted_symbols]
        processed_sentence = ' '.join(tokens)
        processed_sentences.append(processed_sentence)
    
    preprocessed_text = ' '.join(processed_sentences)
    return preprocessed_text

In [16]:
df['preprocessed_text'] = df['sentence'].apply(preprocessing_text)

In [17]:
df.sample(5)

Unnamed: 0,index,sentence,subject,class_name,preprocessed_text
15459,628,Nutritionists professionals who work in this f...,Home_science,11.0,nutritionists professionals who work in this f...
2546,729,You could not fix a nail on the wall or tie a ...,science,8.0,you could not fix a nail on the wall or tie a ...
3649,455,Not the cedar,english,7.0,not the cedar
6600,114,"In the gaseous state, the particles move about...",science,9.0,in the gaseous state the particles move about ...
1165,508,Similarly test the solutions listed in Table 3...,science,7.0,similarly test the solutions listed in table 3...


In [18]:
import textstat

def calculate_avg_words_per_sentence(text):
    sentences = textstat.sentence_count(text)
    words = textstat.lexicon_count(text)
    if sentences > 0:
        return words / sentences
    else:
        return 0

def calculate_avg_syllables_per_word(text):
    words = textstat.lexicon_count(text)
    syllables = textstat.syllable_count(text)
    if words > 0:
        return syllables / words
    else:
        return 0


df['avg_words_per_sentence'] = df['preprocessed_text'].apply(calculate_avg_words_per_sentence)
df['avg_syllables_per_word'] = df['preprocessed_text'].apply(calculate_avg_syllables_per_word)

In [19]:
def calculate_flesch_reading_ease(avg_words_per_sentence, avg_syllables_per_word):
    return 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * avg_syllables_per_word)


df['flesch_reading_ease'] = df.apply(lambda row: calculate_flesch_reading_ease(row['avg_words_per_sentence'], row['avg_syllables_per_word']), axis=1)

In [20]:
df['preprocessed_text'] = df['preprocessed_text'].fillna('')

df['ARI'] = df['preprocessed_text'].apply(lambda x: automated_readability_index(str(x)))

In [21]:
df.sample(5)

Unnamed: 0,index,sentence,subject,class_name,preprocessed_text,avg_words_per_sentence,avg_syllables_per_word,flesch_reading_ease,ARI
14417,689,These preparations should ideally make it easi...,science,10.0,these preparations should ideally make it easi...,15.0,1.733333,44.97,9.6
28780,276,"Fortunately, in many parts of rural and urban ...",social_science,10.0,fortunately in many parts of rural and urban i...,21.0,1.809524,32.434286,15.3
28283,2472,We\nknow that plants and photosynthetic bacter...,biology,12.0,we know that plants and photosynthetic bacteri...,19.0,1.947368,22.802632,15.1
7148,662,"Of these inert elements, the SCIENCE Distribut...",science,9.0,of these inert elements the science distributi...,206.0,1.247573,-107.79966,93.5
15919,144,Larry and Herb were pumping like madmen,english,11.0,larry and herb were pumping like madmen,7.0,1.428571,78.872857,4.3


In [26]:
dfff = df[df['avg_words_per_sentence'] == 206]

## Droping row having ARI <= 0

In [28]:
count = 0
for value in df['ARI']:
    if(value <= 0):
        count = count + 1
print(count)      

1706


In [29]:
df = df[df['ARI'] > 0]
df.reset_index(drop=True, inplace=True)

In [30]:
count = 0
for value in df['ARI']:
    if(value <= 0):
        count = count + 1
print(count) 

0


## Improve accuracy

In [35]:
X = df['preprocessed_text']
y = df['ARI']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:

vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


gbm = GradientBoostingRegressor()

gbm.fit(X_train_vect, y_train)
y_pred = gbm.predict(X_test_vect)

acc = r2_score(y_test, y_pred)
print("R2 Score:", acc)

R2 Score: 0.5327802513011113


## calculating extra info

In [37]:
def count_words(text):
    tokens =  word_tokenize(text)
    return len(tokens)

In [38]:
df['word_counts'] = df['preprocessed_text'].apply(count_words)

In [39]:
def sentence_count(text):
    sentences = sent_tokenize(text)
    return len(sentences)

In [40]:
df['sentence_count'] = df['preprocessed_text'].apply(sentence_count)

In [41]:
def unique_words(text):
    tokens = word_tokenize(text)
    return len(set(tokens))

In [42]:
df['unique_words'] = df['preprocessed_text'].apply(unique_words)

In [43]:
def type_token_ration(text):
    words = text.split()
    unique_words = set(words)
    ttr = len(unique_words)/ len(words)
    return ttr

In [44]:
df['TTR'] = df['preprocessed_text'].apply(type_token_ration)

In [45]:
import spacy

nlp = spacy.load("en_core_web_sm")

def parse_tree_height(sentence):
    doc = nlp(sentence)
    parse_tree = [token.head for token in doc]
    max_depth = max([depth(parse_tree, token) for token in doc])
    return max_depth

def depth(parse_tree, token):
    if token is parse_tree[token.i]:
        return 1
    else:
        return 1 + depth(parse_tree, parse_tree[token.i])


def average_parse_tree_height(text):
    sentences = [sent.text for sent in nlp(text).sents]
    parse_tree_heights = [parse_tree_height(sentence) for sentence in sentences]
    avg_height = sum(parse_tree_heights) / len(parse_tree_heights)
    return avg_height


df['avg_height'] = df['preprocessed_text'].apply(average_parse_tree_height)

## Calculating flesch-kincaid 

In [46]:
def calculate_flesch_kincaid_grade_level(row):
    return 0.39 * row['avg_words_per_sentence'] + 11.8 * row['avg_syllables_per_word'] - 15.59

In [47]:
df['flesch_grade_level'] = df.apply(calculate_flesch_kincaid_grade_level, axis=1)

## Column transformer for pipelining

In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [49]:

X_text = df['preprocessed_text']
X_numeric = df[['avg_words_per_sentence', 'avg_syllables_per_word', 'sentence_count', 'TTR', 'avg_height']]
y = df['flesch_grade_level']

text_preprocessor = TfidfVectorizer(max_features=1000)  
num_preprocessor = StandardScaler()

processor = ColumnTransformer(transformers=[
    ('text', text_preprocessor, 'preprocessed_text'),
    ('numeric', num_preprocessor, ['avg_words_per_sentence', 'avg_syllables_per_word', 'sentence_count', 'TTR', 'avg_height'])
])

X_processed = processor.fit_transform(df)

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

print("Training set - Features:", X_train.shape, "Target:", y_train.shape)
print("Testing set - Features:", X_test.shape, "Target:", y_test.shape)


Training set - Features: (22200, 1005) Target: (22200,)
Testing set - Features: (5551, 1005) Target: (5551,)


## Models

In [73]:
gbm = GradientBoostingRegressor()

gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
acc = r2_score(y_test, y_pred)
print("R2 Score:", acc)
print("mse Score:", mse)

R2 Score: 0.8966669961295215
mse Score: 3.2943946117419434


In [74]:
gbm = RandomForestRegressor()

gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
acc = r2_score(y_test, y_pred)
print("R2 Score:", acc)
print("mse Score:", mse)

R2 Score: 0.8965378510312492
mse Score: 3.2985119305069586


In [50]:
from sklearn.svm import SVR

In [51]:
gbm = SVR()

gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
acc = r2_score(y_test, y_pred)
print("R2 Score:", acc)
print("mse Score:", mse)

R2 Score: 0.9858523470799867
mse Score: 0.31436081214737316


In [52]:
df.head()

Unnamed: 0,index,sentence,subject,class_name,preprocessed_text,avg_words_per_sentence,avg_syllables_per_word,flesch_reading_ease,ARI,word_counts,sentence_count,unique_words,TTR,avg_height,flesch_grade_level
0,1,In Class VI you learnt that food is essential ...,science,7.0,in class vi you learnt that food is essential ...,13.0,1.384615,76.501538,6.1,13,1,13,1.0,6.0,5.818462
1,2,"You also learnt that carbohydrates, proteins, ...",science,7.0,you also learnt that carbohydrates proteins fa...,14.0,1.785714,41.553571,12.5,14,1,14,1.0,6.0,10.941429
2,3,These components of food are called nutrients ...,science,7.0,these components of food are called nutrients ...,13.0,1.461538,69.993846,8.2,13,1,12,0.923077,6.0,6.726154
3,4,All living organisms require food,science,7.0,all living organisms require food,5.0,1.8,49.48,8.4,5,1,5,1.0,4.0,7.6
4,5,Plants can synthesise food for themselves but ...,science,7.0,plants can synthesise food for themselves but ...,12.0,1.666667,53.655,10.9,12,1,11,0.916667,7.0,8.756667


## GRADE

## <font color='red'>ARI</font>

In [53]:
def grade(score):
    score = round(score)
    if 1 <= score <= 6:
        return 'ESS'
    elif 7 <= score <= 8:
        return '12-13'
    elif 9 <= score <= 12:
        return '14-15'
    elif 13 <= score <= 16:
        return '16-18'
    elif 17 <= score <= 18:
        return 'CG'
    else:
        return 'PG'

In [54]:
df['GRADE'] = df['ARI'].apply(grade)

In [55]:
df.sample(10)

Unnamed: 0,index,sentence,subject,class_name,preprocessed_text,avg_words_per_sentence,avg_syllables_per_word,flesch_reading_ease,ARI,word_counts,sentence_count,unique_words,TTR,avg_height,flesch_grade_level,GRADE
26263,155,You may have realised the importance of these ...,social_science,10.0,you may have realised the importance of these ...,13.0,1.384615,76.501538,7.2,13,1,13,1.0,6.0,5.818462,12-13
4644,703,They are near our home and we can go there on ...,social_science,7.0,they are near our home and we can go there on ...,16.0,1.0,105.995,1.3,16,1,16,1.0,8.0,2.45,ESS
12114,103,A scale for measuring hydrogen ion concentrati...,science,10.0,a scale for measuring hydrogen ion concentrati...,16.0,1.6875,47.8325,10.7,16,1,14,0.875,10.0,10.5625,14-15
17885,727,They are triploblastic and coelomate animals,biology,11.0,they are triploblastic and coelomate animals,6.0,1.666667,59.745,12.2,6,1,6,1.0,5.0,6.416667,14-15
18449,1306,Mitochondria sing,biology,11.0,mitochondria sing,2.0,2.5,-6.695,17.3,2,1,2,1.0,3.0,14.69,CG
1168,109,Harvesting in our country is either done manua...,science,8.0,harvesting in our country is either done manua...,16.0,1.625,53.12,9.2,16,1,15,0.9375,7.0,9.825,14-15
26241,130,Soil erosion is also caused due to defective m...,social_science,10.0,soil erosion is also caused due to defective m...,11.0,1.545455,64.924545,6.8,11,1,11,1.0,6.0,6.936364,12-13
14230,327,"Large areas, officially designated as forest l...",english,11.0,large areas officially designated as forest la...,11.0,2.181818,11.088182,13.6,11,1,11,1.0,7.0,14.445455,16-18
18150,996,Cortical layers below hypodermis consist of ro...,biology,11.0,cortical layers below hypodermis consist of ro...,15.0,2.266667,-0.15,20.0,15,1,15,1.0,9.0,17.006667,PG
23412,3403,“I do not want any constitution in which the U...,history,12.0,i do not want any constitution in which the un...,24.0,1.291667,73.2,7.9,25,1,21,0.84,9.0,9.011667,12-13


## <font color='red'>Flesch-kincaid</font>

In [112]:
df = df.drop(columns = ['class'])

In [57]:
def FKGL(score):
    if score < 3:
        return "Very Easy"
    elif 3 <= score < 6:
        return "Easy"
    elif 6 <= score < 8:
        return "Medium"
    elif 8 <= score < 10:
        return "Difficult"
    else:
        return "Very Difficult"

In [58]:
df['FKGL'] = df['flesch_grade_level'].apply(FKGL)

In [59]:
df.head()

Unnamed: 0,index,sentence,subject,class_name,preprocessed_text,avg_words_per_sentence,avg_syllables_per_word,flesch_reading_ease,ARI,word_counts,sentence_count,unique_words,TTR,avg_height,flesch_grade_level,GRADE,FKGL
0,1,In Class VI you learnt that food is essential ...,science,7.0,in class vi you learnt that food is essential ...,13.0,1.384615,76.501538,6.1,13,1,13,1.0,6.0,5.818462,ESS,Easy
1,2,"You also learnt that carbohydrates, proteins, ...",science,7.0,you also learnt that carbohydrates proteins fa...,14.0,1.785714,41.553571,12.5,14,1,14,1.0,6.0,10.941429,14-15,Very Difficult
2,3,These components of food are called nutrients ...,science,7.0,these components of food are called nutrients ...,13.0,1.461538,69.993846,8.2,13,1,12,0.923077,6.0,6.726154,12-13,Medium
3,4,All living organisms require food,science,7.0,all living organisms require food,5.0,1.8,49.48,8.4,5,1,5,1.0,4.0,7.6,12-13,Medium
4,5,Plants can synthesise food for themselves but ...,science,7.0,plants can synthesise food for themselves but ...,12.0,1.666667,53.655,10.9,12,1,11,0.916667,7.0,8.756667,14-15,Difficult


## RESULT

In [60]:
result = df[['class_name', 'ARI', 'flesch_grade_level', 'GRADE', 'FKGL']]

In [69]:
result.sample(5)

Unnamed: 0,class_name,ARI,flesch_grade_level,GRADE,FKGL
17807,11.0,6.8,5.818462,12-13,Easy
7581,9.0,13.8,10.823478,16-18,Very Difficult
12083,10.0,7.6,6.936364,12-13,Medium
23386,12.0,12.8,10.723333,16-18,Very Difficult
7141,9.0,10.3,7.586667,14-15,Medium


## Input

In [62]:
import string

def predict_ari(input_text):
    preprocessed_text = preprocessing_text(input_text)
    words_per_sentence = calculate_avg_words_per_sentence(preprocessed_text)
    syllables_per_word = calculate_avg_syllables_per_word(preprocessed_text)
    sentenc_count = sentence_count(preprocessed_text) 
    avg_height = average_parse_tree_height(preprocessed_text)
    TTR        = type_token_ration(preprocessed_text)
    

    
    input_data = {'preprocessed_text': [preprocessed_text],
                  'avg_words_per_sentence': [words_per_sentence],
                  'avg_syllables_per_word': [syllables_per_word],
                  'sentence_count': [sentenc_count],
                  'TTR': [TTR],
                  'avg_height': [avg_height]}
    input_df = pd.DataFrame(input_data)

    
    input_processed = processor.transform(input_df)
    predicted_ari = gbm.predict(input_processed)
    
    return predicted_ari

In [63]:
input_text = "Spermatogenesis starts at the age of puberty due to significant increase in the secretion of gonadotropin releasing hormone"

In [65]:
print(predict_ari(input_text))

[14.28540869]


In [67]:
print(FKGL(14))

Very Difficult


In [70]:
print(grade(14))

16-18
