# Sentence Analysis

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install --quiet "deplacy" "trankit" "transformers"

In [None]:
import deplacy

import trankit
nlp_parser = trankit.Pipeline(lang="english", gpu=True)

Loading pretrained XLM-Roberta, this may take a while...
Loading tokenizer for english
Loading tagger for english
Loading lemmatizer for english
Loading NER tagger for english
Active language: english


In [None]:
%cd /content/drive/MyDrive/IELTS_Score

/content/drive/MyDrive/IELTS_Score


In [None]:
%cd /content

/content


In [None]:
import pandas as pd

## 1. Simple, compound, and complex sentences

### Simple sentence

In [None]:
simple_sentences = [
                    'Learning English is important nowadays.', # S-V
                    'Learning English and computer using are important nowadays.', # SS-V
                    'I play some video games and learn English on my computer.', # S-VV
                    'My sister and I play some video games and learn English on our computer.', # SS-VV
]

In [None]:
# Parse every simple-sentence example with the trankit pipeline (`nlp_parser`)
# and pass the parse to deplacy.serve.
# NOTE(review): port=None presumably renders the tree inline in the notebook
# instead of starting a local server -- confirm against the deplacy docs.
for sentence in simple_sentences:
    deplacy.serve(nlp_parser(sentence), port=None)


### Compound sentence

In [None]:
# Use coordinating conjunctions (FANBOYS): F: for, A: and, N: nor, B: but, O: or, Y: yet, S: so

compound_sentences = [
                      'Playing video game is fun, but it can be dangerous too.',
                      'Nature does not hurry, yet everything is accomplished.',
                      'I am working now, but we will eat later. ',
                      'Playing video game is fun, but it can be dangerous too, we must be careful.'
]

In [None]:
# Parse every compound-sentence example with the trankit pipeline (`nlp_parser`)
# and pass the parse to deplacy.serve.
# NOTE(review): port=None presumably renders the tree inline in the notebook
# instead of starting a local server -- confirm against the deplacy docs.
for sentence in compound_sentences:
    deplacy.serve(nlp_parser(sentence), port=None)

### Complex sentence

In [None]:
# combine independent clauses with subordinate clauses
# use subordinating conjunction

complex_sentences = [
                    'Because I am working now, we will eat later. ',
                    'He always takes time to cover carefully his daughter even though he is extremely busy.',
                    'You should think about money saving from now if you want to study abroad.',
                    'Even though he is busy, he always takes time to cover carefully his daughter.'
]

In [None]:
# Parse every complex-sentence example with the trankit pipeline (`nlp_parser`)
# and pass the parse to deplacy.serve.
# NOTE(review): port=None presumably renders the tree inline in the notebook
# instead of starting a local server -- confirm against the deplacy docs.
for sentence in complex_sentences:
    deplacy.serve(nlp_parser(sentence), port=None)

### Passive voice sentence

In [None]:
passive_sentences = [
                     'The house was being painted when I arrived.',
                     'Over 20 models have been produced in the past two years.'
]

In [None]:
# Parse every passive-voice example with the trankit pipeline (`nlp_parser`)
# and pass the parse to deplacy.serve.
# NOTE(review): port=None presumably renders the tree inline in the notebook
# instead of starting a local server -- confirm against the deplacy docs.
for sentence in passive_sentences:
    deplacy.serve(nlp_parser(sentence), port=None)

### Sentence Classifier

In [None]:
from collections import Counter
import pandas as pd

In [None]:
import trankit
nlp_parser = trankit.Pipeline(lang="english", gpu=True)

def nlp_sentence_parser(text):
    """Run the module-level ``nlp_parser`` pipeline on ``text`` and flatten it.

    Returns:
        A ``(sentences, nlp_sentences)`` pair of equal-length lists.
        ``sentences[i]`` is the raw text of the i-th sentence and
        ``nlp_sentences[i]`` is a list with one
        ``(text, xpos, upos, deprel, head)`` tuple per token of that sentence.
    """
    token_fields = ('text', 'xpos', 'upos', 'deprel', 'head')

    sentences, nlp_sentences = [], []
    for parsed in nlp_parser(text)['sentences']:
        sentences.append(parsed['text'])
        nlp_sentences.append(
            [tuple(token[field] for field in token_fields) for token in parsed['tokens']]
        )

    return sentences, nlp_sentences

Loading pretrained XLM-Roberta, this may take a while...
Loading tokenizer for english
Loading tagger for english
Loading lemmatizer for english
Loading NER tagger for english
Active language: english


In [None]:
nlp_sentence_parser("I go to school")

(['I go to school'],
 [[('I', 'PRP', 'PRON', 'nsubj', 2),
   ('go', 'VBP', 'VERB', 'root', 0),
   ('to', 'IN', 'ADP', 'case', 4),
   ('school', 'NN', 'NOUN', 'obl', 2)]])

In [None]:
subject_tags = ['csubj', 'csubj:pass', 'nsubj', 'nsubj:pass', 'xsubj']

def sentence_classifier(question_respond: str) -> dict:
    """Sort the sentences of a response into simple / compound / complex / passive.

    The text is parsed with ``nlp_sentence_parser``; every token is expected as
    ``(text, xpos, upos, deprel, head)``. Each sentence lands in exactly one of
    the simple / compound / complex lists and may additionally be listed as
    passive.

    Heuristics:
        * complex  -- the sentence has a ``mark`` dependent, a wh-adverb
          (``advmod`` + ``ADV`` + ``WRB``), or an ordinary adverb whose span to
          its head contains a subject;
        * simple   -- otherwise, at most one subject dependent (this includes
          subject-less sentences such as imperatives, which used to fall
          through to "compound");
        * compound -- otherwise (two or more subjects);
        * passive  -- both ``nsubj:pass`` and ``aux:pass`` occur.

    Args:
        question_respond: Free text that may contain several sentences.

    Returns:
        Dict with keys ``'simple-sentences'``, ``'compound-sentences'``,
        ``'complex-sentences'`` and ``'passive-sentences'``, each mapping to a
        list of sentence strings in document order.
    """
    simple_sentences, compound_sentences, complex_sentences, passive_sentences = [], [], [], []

    sentences, nlp_sentences = nlp_sentence_parser(question_respond)
    for sentence, nlp_sentence in zip(sentences, nlp_sentences):
        deprels = [token[3] for token in nlp_sentence]
        mark_counter = deprels.count('mark')
        subject_tag_counter = sum(tag in subject_tags for tag in deprels)
        adv_clause_counter, wh_clause_counter = 0, 0

        for position, (_text, xpos, upos, deprel, head) in enumerate(nlp_sentence):
            if deprel != 'advmod' or upos != 'ADV':
                continue
            if xpos == 'WRB':
                wh_clause_counter += 1
                continue
            # `head` is the 1-based token id of the governor (0 = root) while
            # `position` is 0-based: the forward slice runs from the adverb
            # through its head, the backward slice covers the tokens strictly
            # between the head and the adverb.
            if position < head:
                span = deprels[position:head]
            else:
                span = deprels[head:position]
            adv_clause_counter += sum(tag in subject_tags for tag in span)

        if mark_counter >= 1 or adv_clause_counter >= 1 or wh_clause_counter >= 1:
            complex_sentences.append(sentence)
        elif subject_tag_counter <= 1:
            # Zero subjects (imperatives, fragments) is not evidence of a
            # second independent clause, so treat it as simple.
            simple_sentences.append(sentence)
        else:
            compound_sentences.append(sentence)

        # Passive voice is tracked independently of the clause structure.
        if 'nsubj:pass' in deprels and 'aux:pass' in deprels:
            passive_sentences.append(sentence)

    return {
        'simple-sentences': simple_sentences,
        'compound-sentences': compound_sentences,
        'complex-sentences': complex_sentences,
        'passive-sentences': passive_sentences
    }

In [None]:
"".join(simple_sentences + compound_sentences + complex_sentences)

'Learning English is important nowadays.Learning English and computer using are important nowadays.I play some video games and learn English on my computer.My sister and I play some video games and learn English on our computer.Playing video game is fun, but it can be dangerous too.Nature does not hurry, yet everything is accomplished.I am working now, but we will eat later. Playing video game is fun, but it can be dangerous too, we must be careful.Because I am working now, we will eat later. He always takes time to cover carefully his daughter even though he is extremely busy.You should think about money saving from now if you want to study abroad.Even though he is busy, he always takes time to cover carefully his daughter.'

In [None]:
sentence_classifier("".join(simple_sentences + compound_sentences + complex_sentences))

{'simple-sentences': ['Learning English is important nowadays.',
  'Learning English and computer using are important nowadays.',
  'I play some video games and learn English on my computer.',
  'My sister and I play some video games and learn English on our computer.'],
 'compound-sentences': ['Playing video game is fun, but it can be dangerous too.',
  'Nature does not hurry, yet everything is accomplished.',
  'I am working now, but we will eat later.',
  'Playing video game is fun, but it can be dangerous too, we must be careful.'],
 'complex-sentences': ['Because I am working now, we will eat later.',
  'He always takes time to cover carefully his daughter even though he is extremely busy.',
  'You should think about money saving from now if you want to study abroad.',
  'Even though he is busy, he always takes time to cover carefully his daughter.'],
 'passive-sentences': ['Nature does not hurry, yet everything is accomplished.']}

**Some subordinating conjunctions**

In [None]:
!gdown 1WVxp5TXBu5ZOpMIP8wfbOpQU_VipMGgL
!unzip subordinating_conjunctions_test.zip

Downloading...
From: https://drive.google.com/uc?id=1WVxp5TXBu5ZOpMIP8wfbOpQU_VipMGgL
To: /content/subordinating_conjunctions_test.zip
  0% 0.00/4.02k [00:00<?, ?B/s]100% 4.02k/4.02k [00:00<00:00, 6.79MB/s]
Archive:  subordinating_conjunctions_test.zip
  inflating: subordinating_conjunctions_test.csv  


In [None]:
cj_df = pd.read_csv("./subordinating_conjunctions_test.csv")
cj_df

Unnamed: 0,id,text,gt
0,1,John plays games after he finishes his homework.,after
1,2,"Even though she is seriously ill, she still go...",even though
2,3,Sara begins to sneeze whenever she opens the w...,whenever
3,4,I go out after doing the housework.,after
4,5,Mike did not forget to greet everyone before l...,before
...,...,...,...
121,122,She seemed undecided about whether to stay or go.,whether
122,123,He did my task while I was away from the office.,while
123,124,Do you know the person who is standing near th...,who
124,125,Whoever wins will get a cash prize.,whoever


In [None]:
# The test sentences carry their own final punctuation, so join them with a
# plain space; joining with ". " doubled the period ("..") in the results.
infor_sentences = sentence_classifier(" ".join(cj_df.text))

In [None]:
len(infor_sentences['complex-sentences'])/len(cj_df)

0.8968253968253969

In [None]:
infor_sentences['passive-sentences']

['The lion is so fierce as he is painted.',
 'When I was young, I cried for hours after my dog was run over.',
 'Even if you perform your best, you won’t be appreciated..',
 'Now that everything is under control, the lockdown can be lifted..']

In [None]:
# Each example already ends with a period, so join with a plain space; joining
# with ". " doubled the period ("..") in the returned sentences.
sentence_classifier(" ".join(passive_sentences))['passive-sentences']

['The house was being painted when I arrived..',
 'Over 20 models have been produced in the past two years.']

## 2. Paraphrase sentence

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# MRPC RoBERTa Base Model
mrpc_model = "textattack/roberta-base-MRPC"
mrpc_tokenizer = AutoTokenizer.from_pretrained(mrpc_model)
mrpc_classifier = AutoModelForSequenceClassification.from_pretrained(mrpc_model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def preprocess_text(text, remove_special_character=True):
    """Lower-case ``text``, handle special characters and collapse whitespace.

    Every character of ``string.punctuation`` except ``'``, ``-``, ``.`` and
    ``,`` counts as a special character.

    Args:
        text: The string to normalise.
        remove_special_character: When true, special characters are replaced by
            a space; otherwise they are kept but padded with a space on each
            side so they become separate tokens.

    Returns:
        The normalised string, or ``None`` when ``text`` is not a string (the
        offending value is printed so it can be inspected).
    """
    replace_chars = [char for char in string.punctuation if char not in "'-.,"]
    try:
        for char in replace_chars:
            replacement = " " if remove_special_character else " " + char + " "
            text = text.replace(char, replacement)
        return " ".join(text.lower().split())
    except (AttributeError, TypeError):
        # Best effort for non-string input (e.g. None or NaN): report the value
        # instead of raising. Only the errors such input triggers are caught,
        # so interrupts and genuine bugs are no longer swallowed.
        print(text)
        return None

In [None]:
def check_paraphrased(text: str, threshold_paraphsed=0.65):
    """Find pairs of sentences in ``text`` that paraphrase each other.

    The text is split with ``sent_tokenize``; every unordered pair of sentences
    is normalised with ``preprocess_text`` and scored by the MRPC classifier
    (``mrpc_tokenizer`` / ``mrpc_classifier``). A pair is reported when the
    softmax probability at index 1 reaches the threshold.

    Args:
        text: Text containing the sentences to compare.
        threshold_paraphsed: Minimum index-1 probability for a pair to count.

    Returns:
        List of ``(sentence_a, sentence_b)`` tuples holding the original
        (un-normalised) sentences, in pair-enumeration order. Empty when the
        text has fewer than two sentences.
    """
    sentences = sent_tokenize(text)
    idx_sentences = [(first, second)
                     for first in range(len(sentences) - 1)
                     for second in range(first + 1, len(sentences))]
    if not idx_sentences:
        # Nothing to compare: never hand an empty batch to tokenizer or model.
        return []

    # Normalise each sentence once instead of once per pair it takes part in.
    cleaned = [preprocess_text(sentence, True) for sentence in sentences]
    pair_sentences = [(cleaned[first], cleaned[second]) for first, second in idx_sentences]

    tokens = mrpc_tokenizer.batch_encode_plus(pair_sentences, max_length=64,
                                              padding='max_length',
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        classification_logits = mrpc_classifier(**tokens)
    results = torch.softmax(classification_logits['logits'], dim=1).tolist()

    return [(sentences[first], sentences[second])
            for (first, second), probabilities in zip(idx_sentences, results)
            if probabilities[1] >= threshold_paraphsed]

In [None]:
paraphrase_sentences = [
                        'Nowadays, more and more foreign students are going to English-speaking countries to learn the international language – English.',
                        'In recent times, a growing number of international students are learning English in English-speaking countries.',
                        'It is undoubtedly true that studying English in an English-speaking country is the best way, but it is not the only way to learn it.'
]

In [None]:
check_paraphrased(" ".join(paraphrase_sentences))

[('Nowadays, more and more foreign students are going to English-speaking countries to learn the international language – English.',
  'In recent times, a growing number of international students are learning English in English-speaking countries.')]

In [None]:
df = pd.read_csv('ok.csv', index_col=0)
df

Unnamed: 0,Question,Essay,COHERENCE AND COHESION,LEXICAL,GRAMMAR,TASK,OVERALL,QE
0,many manufactured food and drink products cont...,most of the food we are consuming is having hi...,9,4,4,5.0,5.5,many manufactured food and drink products cont...
1,the bar graph shows the global sales in billio...,the chart illustrates how many types of virtua...,9,4,9,9.0,8.0,the bar graph shows the global sales in billio...
2,goverments should spend money on railways rath...,with the advancements in the transportation in...,7,7,8,6.5,7.0,goverments should spend money on railways rath...
3,a friend has invited you to a family dinner at...,"dear sam, i hope you and your family are doing...",9,9,9,8.0,9.0,a friend has invited you to a family dinner at...
4,you have a full-time job and you are also doin...,"dear mr jerry, i am writing to inform you abou...",9,9,9,8.0,9.0,you have a full-time job and you are also doin...
...,...,...,...,...,...,...,...,...
173696,labour-saving devices such as dishwashers and ...,it is argued that new technology gadgets that ...,6,5,5,5.0,5.5,labour-saving devices such as dishwashers and ...
173697,some people prefer to spend their lives doing ...,people have different perspectives regarding t...,6,5,5,9.0,6.5,some people prefer to spend their lives doing ...
173698,some people think it is better to have friends...,there are people who say it's a desirable thin...,6,5,5,4.0,5.0,some people think it is better to have friends...
173699,some people believe that unpaid community serv...,it’s argued by few individuals that unpaid fac...,7,4,4,9.0,6.0,some people believe that unpaid community serv...


In [None]:
# define the function to be applied to each row in the DataFrame
def process_row_with_variable(row, bc_sentence_classifier):
    """Classify the ``'Essay'`` entry of ``row``.

    Args:
        row: Mapping-like record exposing an ``'Essay'`` key.
        bc_sentence_classifier: Wrapper whose ``.value`` attribute holds the
            classifier callable.

    Returns:
        Whatever the wrapped classifier returns for ``row['Essay']``.
    """
    classify = bc_sentence_classifier.value
    return classify(row['Essay'])



In [None]:
df.loc[:, 'Essay']

0         most of the food we are consuming is having hi...
1         the chart illustrates how many types of virtua...
2         with the advancements in the transportation in...
3         dear sam, i hope you and your family are doing...
4         dear mr jerry, i am writing to inform you abou...
                                ...                        
173696    it is argued that new technology gadgets that ...
173697    people have different perspectives regarding t...
173698    there are people who say it's a desirable thin...
173699    it’s argued by few individuals that unpaid fac...
173700    the chart gives us information about the physi...
Name: Essay, Length: 173701, dtype: object

In [None]:
subject_tags = ['csubj', 'csubj:pass', 'nsubj', 'nsubj:pass', 'xsubj']

def sentence_classifier(question_respond: str) -> list:
    """Count the simple / compound / complex / passive sentences of a response.

    The text is parsed with ``nlp_sentence_parser``; every token is expected as
    ``(text, xpos, upos, deprel, head)``. Each sentence is counted in exactly
    one of the first three categories and may additionally count as passive.

    Heuristics:
        * complex  -- the sentence has a ``mark`` dependent, a wh-adverb
          (``advmod`` + ``ADV`` + ``WRB``), or an ordinary adverb whose span to
          its head contains a subject;
        * simple   -- otherwise, at most one subject dependent (this includes
          subject-less sentences such as imperatives, which used to be counted
          as compound);
        * compound -- otherwise (two or more subjects);
        * passive  -- both ``nsubj:pass`` and ``aux:pass`` occur.

    Args:
        question_respond: Free text that may contain several sentences.

    Returns:
        ``[n_simple, n_compound, n_complex, n_passive]``.
    """
    simple_count = compound_count = complex_count = passive_count = 0

    sentences, nlp_sentences = nlp_sentence_parser(question_respond)
    for nlp_sentence in nlp_sentences[:len(sentences)]:
        deprels = [token[3] for token in nlp_sentence]
        mark_counter = deprels.count('mark')
        subject_tag_counter = sum(tag in subject_tags for tag in deprels)
        adv_clause_counter, wh_clause_counter = 0, 0

        for position, (_text, xpos, upos, deprel, head) in enumerate(nlp_sentence):
            if deprel != 'advmod' or upos != 'ADV':
                continue
            if xpos == 'WRB':
                wh_clause_counter += 1
                continue
            # `head` is the 1-based token id of the governor (0 = root) while
            # `position` is 0-based: the forward slice runs from the adverb
            # through its head, the backward slice covers the tokens strictly
            # between the head and the adverb.
            if position < head:
                span = deprels[position:head]
            else:
                span = deprels[head:position]
            adv_clause_counter += sum(tag in subject_tags for tag in span)

        if mark_counter >= 1 or adv_clause_counter >= 1 or wh_clause_counter >= 1:
            complex_count += 1
        elif subject_tag_counter <= 1:
            # Zero subjects (imperatives, fragments) is not evidence of a
            # second independent clause, so count it as simple.
            simple_count += 1
        else:
            compound_count += 1

        # Passive voice is tracked independently of the clause structure.
        if 'nsubj:pass' in deprels and 'aux:pass' in deprels:
            passive_count += 1

    return [simple_count, compound_count, complex_count, passive_count]

In [None]:
import concurrent.futures as cf

# Classify every essay on a pool of 8 threads and keep the results.
# Executor.map yields results in the same order as the essays and re-raises
# any worker exception; the previous submit() loop dropped both the results
# and the errors.
with cf.ThreadPoolExecutor(8) as worker:
    sentence_features = list(worker.map(sentence_classifier, df.loc[:, 'Essay']))

In [None]:
def check_paraphrased(text: str, threshold_paraphsed=0.5):
    """Count the sentence pairs in ``text`` that paraphrase each other.

    The text is split with ``sent_tokenize``; every unordered pair of sentences
    is normalised with ``preprocess_text`` and scored by the MRPC classifier
    (``mrpc_tokenizer`` / ``mrpc_classifier``). A pair counts when the softmax
    probability at index 1 reaches the threshold.

    Args:
        text: Text containing the sentences to compare.
        threshold_paraphsed: Minimum index-1 probability for a pair to count.

    Returns:
        A one-element list ``[count]``. ``[0]`` is returned for non-string
        input, for text with fewer than two sentences, and when scoring fails.
    """
    if not isinstance(text, str):
        # Non-text input (e.g. a missing cell) has no sentences to compare.
        return [0]
    try:
        sentences = sent_tokenize(text)
        # Normalise each sentence once instead of once per pair.
        cleaned = [preprocess_text(sentence, True) for sentence in sentences]
        pair_sentences = [(cleaned[first], cleaned[second])
                          for first in range(len(sentences) - 1)
                          for second in range(first + 1, len(sentences))]
        if not pair_sentences:
            # Never hand an empty batch to the tokenizer or the model.
            return [0]

        tokens = mrpc_tokenizer.batch_encode_plus(pair_sentences, max_length=64,
                                                  padding='max_length',
                                                  truncation=True,
                                                  return_attention_mask=True,
                                                  return_tensors="pt")
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            classification_logits = mrpc_classifier(**tokens)
        results = torch.softmax(classification_logits['logits'], dim=1).tolist()

        return [sum(1 for probabilities in results if probabilities[1] >= threshold_paraphsed)]
    except Exception:
        # Deliberate best effort: one failing text must not abort a whole
        # batch of feature extractions. Unlike a bare `except`, this still
        # lets KeyboardInterrupt / SystemExit through.
        return [0]

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# create a Spark session
spark = SparkSession.builder.appName("ReadCSVAndProcessData").getOrCreate()

# read a CSV file
df = spark.read.csv("ok.csv", header=True, inferSchema=True)

In [None]:
df_100 = df.limit(10)
df_100.show()

In [None]:
from pyspark import SparkConf, SparkContext

# Reuse the active SparkContext (the one behind the SparkSession created
# earlier in this notebook) and only build a fresh local one when none exists.
# The previous version wrapped `sc.stop()` in a bare `except` and then
# constructed a second SparkContext next to the session's own context.
# NOTE(review): stopping the session's context would presumably also
# invalidate DataFrames already read through `spark` -- hence no stop() here.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("ReadCSVAndProcessData")
)

In [None]:
# define the function to be applied to each row in the DataFrame
def process_row_with_variable(row, bc_sentence_classifier):
    """Apply the broadcast classifier to one row's essay.

    Args:
        row: Mapping-like record exposing an ``'Essay'`` key.
        bc_sentence_classifier: Object whose ``.value`` attribute is the
            classifier callable.

    Returns:
        The classifier's result for ``row['Essay']``.
    """
    essay = row['Essay']
    return bc_sentence_classifier.value(essay)

# Wrap the sentence_classifier function in a Spark broadcast variable so the
# mapped function can reach it through `.value`.
# NOTE(review): sentence_classifier calls the module-level nlp_sentence_parser,
# which in turn uses the trankit `nlp_parser` pipeline -- confirm that this
# whole chain can actually be serialised and shipped to Spark workers.
bc_sentence_classifier = sc.broadcast(sentence_classifier)

# Run process_row_with_variable on every row of the 10-row sample (`df_100`)
# and turn the mapped RDD back into a DataFrame.
# NOTE(review): the most recent sentence_classifier definition returns a list
# of four counts while only one column name ("result") is given -- verify the
# resulting schema.
processed_df = df_100.rdd.map(lambda x: process_row_with_variable(x, bc_sentence_classifier)).toDF(["result"])

# display the processed DataFrame
processed_df.show()

In [None]:
df

In [None]:

def check_spelling(text):
    """Count the grammar errors LanguageTool reports for ``text``.

    A match is counted when it offers at least one replacement and its rule id
    is either in ``language_tool_dict['GRAMMAR']['id']`` or equal to
    ``'EN_A_VS_AN'``.

    Returns:
        A one-element list ``[count]``.
    """
    grammar_error_count = sum(
        1
        for match in nlp_tool.check(text)
        if len(match.replacements) > 0
        and (match.ruleId in language_tool_dict['GRAMMAR']['id']
             or match.ruleId == 'EN_A_VS_AN')
    )
    return [grammar_error_count]


In [None]:
check_spelling(df.loc[:, 'Essay'])

NameError: ignored

In [None]:
df = pd.read_csv('ok.csv', index_col=0)


In [None]:
df.head()

In [None]:
df = df[:100]

In [None]:
df

In [None]:
df = df['Essay']

In [None]:
import pandas as pd
import concurrent.futures

# def process_row(row):
#     # Perform some processing on each row of the DataFrame
#     return row['col1'] * 2

# def process_dataframe(df):
    # Select only the col1 column
   # df = df[['col1']]

    # Process each row in a separate thread
# Submit one task per row to a thread pool and gather the results in
# completion order (not row order).
# NOTE(review): `process_row` and `process_dataframe` are only defined in later
# cells of this notebook, so this cell fails on a fresh top-to-bottom run.
# NOTE(review): the preceding cell rebinds `df` to the 'Essay' column (a
# Series); `iterrows()` is a DataFrame method -- confirm which object is meant.
results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_row, row) for index, row in df.iterrows()]
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())

    # Combine the results (not implemented here; `results` stays a plain list)


# Example usage
# df = pd.read_csv('data.csv')
result = process_dataframe(df)
print(result)
This example creates a pool of worker threads using concurrent.futures.ThreadPoolExecutor (with the default number of workers) and submits each row of the DataFrame to be processed in a separate thread by the process_row function. The result for each row is collected as it completes, and the results are then combined into a single DataFrame, which is returned.

Keep in mind that submitting rows one at a time, without splitting the DataFrame into chunks, may not give a significant speedup over single-threaded processing, especially when the work done per row is fast. Evaluate the trade-offs of multithreaded processing carefully before relying on it.





In [None]:
import pandas as pd
import concurrent.futures

def process_row(row):
    # Perform some processing on each chunk of the DataFrame
    return sentence_classifier(row)
def process_dataframe(df):
    # Split the DataFrame into chunks

    # Process each chunk in a separate thread
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(sentence_classifier, row) for row in df]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())

    # Combine the results
    return df.concat(results)

# Example usage
#df = pd.read_csv('data.csv')
result = process_dataframe(df)
print(result)

In [None]:
def process_row(row):
    """Return ``sentence_classifier`` applied to the ``'Essay'`` entry of ``row``."""
    return sentence_classifier(row['Essay'])

In [None]:
import pandas as pd
from multiprocessing import Pool

# example function to be applied to the DataFrame
def process_row(row):
    """Classify one row: look up its ``'Essay'`` text and run ``sentence_classifier`` on it."""
    essay = row['Essay']
    return sentence_classifier(essay)

# Number of workers in the pool. Despite the name these are processes, not
# threads: `Pool` is imported from `multiprocessing`.
n_threads = 40

# Create a pool of worker processes.
# NOTE(review): process_row relies on the module-level sentence_classifier and
# the model behind it -- confirm those are usable inside worker processes.
with Pool(n_threads) as p:
    # Apply process_row to every row; Pool.map returns results in input order.
    results = p.map(process_row, [row for index, row in df.iterrows()])

# Store the per-row results as a new 'result_1' column.
df['result_1'] = results

In [None]:
df = df[:100]

In [None]:
df['G_features_1'] = df['Essay'].apply(sentence_classifier)

In [None]:
df['G_features_2'] = df['QE'].apply(check_paraphrased)

In [None]:
df['G_features_3'] = df['Essay'].apply(check_spelling)

In [None]:
df

# Word Analysis

### Basic Analysis

In [None]:
import string
import re
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def get_info_words(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))

In [None]:
question_respond = """
Independence is a factor that is becoming stronger in our world once the opportunities for getting a job are increasing. However some people say that the disadvantages of being your own boss are overbalanced, because you will have to work harder and probably be more isolated. In this essay I will show why I believe the advantages outweigh.

Firstly, this globalized world comes up with different ways of employment so it is fast for some individuos get a new job. So, it can be noticed that people are becoming more and more entrepreneurs which is understandable since working for yourself is possible managing the business in your way and building a staff, for example, with your own priorities. Moreover, the gain of enriching experiences in various sectors also means there is no pressure from anyone to do the job in a certain method or at a specific time.

On the other hand, it is remarkable that in this case you have to work harder and be responsible so everything can be done, and must deal with any problems that may appear. However, I believe the advantages outweigh since you have more freedom consequently, you will be able to perform your duties with much more mastery and property. Along with this, it is really gratifying to see your idea come to life and put into practice.

In conclusion, there are some disadvantages of being an entrepreneur as it is harder work itself. Whereas the advantages stand out one time you have more freedom for running your business, also the  independence and a feeling of gratitude. Briefly, the advantages of being an entrepreneur outweigh the drawbacks."""

In [None]:
get_info_words(question_respond)

300

### Spelling and Grammar error

In [None]:
!gdown 1_NQXwSh9KALaS3BryP2FhetcOVyba94Y

Downloading...
From: https://drive.google.com/uc?id=1_NQXwSh9KALaS3BryP2FhetcOVyba94Y
To: /content/drive/MyDrive/IELTS_Score/language_tool_dictionary.pkl
  0% 0.00/155k [00:00<?, ?B/s]100% 155k/155k [00:00<00:00, 68.5MB/s]


In [None]:
!pip install language_tool_python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting language_tool_python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language_tool_python
Successfully installed language_tool_python-2.7.1


In [None]:
import language_tool_python
nlp_tool = language_tool_python.LanguageTool('en-US')

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:02<00:00, 79.4MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp13a4ms2b.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /root/.cache/language_tool_python.


In [None]:
import pickle
def load_language_tool_dict(path):
    """Load the pickled LanguageTool rule dictionary from a directory.

    Args:
        path: Directory containing ``language_tool_dictionary.pkl``; a trailing
            slash is accepted.

    Returns:
        The unpickled object stored in that file.
    """
    import os

    # os.path.join avoids the doubled separator that plain string
    # concatenation produced when `path` already ends with "/".
    pickle_path = os.path.join(path, 'language_tool_dictionary.pkl')
    # SECURITY: unpickling can execute arbitrary code -- only load this file
    # from a source you trust.
    with open(pickle_path, 'rb') as f:
        return pickle.load(f)

In [None]:
language_tool_dict = load_language_tool_dict('/content/drive/MyDrive/IELTS_Score/')

In [None]:
language_tool_dict['GRAMMAR']['id']

{'ABLE_VBP',
 'ACCORDING_TO',
 'ACCORDING_TO_ME',
 'ACCUSTOMED_TO',
 'ACHE_COMPOUNDS',
 'ADDITIONAL',
 'ADMIT_ENJOY_VB',
 'ADVERB_OR_HYPHENATED_ADJECTIVE',
 'ADVERB_VERB_ADVERB_REPETITION',
 'ADVISE_VBG',
 'AD_AND',
 'AFFECTS',
 'AFFORD_VB',
 'AFFORD_VBG',
 'AFRAID_OF_HEIGHT',
 'AFRO_AMERICAN_HYPHEN',
 'AFTERMARKET',
 'AFTER_NOON',
 'AGREEMENT_QUESTION',
 'AGREEMENT_SENT_START',
 'AGREEMENT_SENT_START_2',
 'AGREEMENT_THEIR_HIS',
 'AIRCRAFTS',
 'AIR_BNB',
 'AIR_COMPOUNDS',
 'ALKA_SELTZER',
 'ALLOW_TO',
 'ALLOW_TO_DO',
 'ALL_GIRLS_HYPHEN',
 'ALL_IN_ONE_HYPHEN',
 'ALL_KNOWING_HYPHEN',
 'ALL_MOST_SOME_OF_NOUN',
 'ALL_NN',
 'ALL_TIME_HYPHEN',
 'ALL_WHEEL',
 'ALL_YOU_CAN_EAT_HYPHEN',
 'AMERICANO',
 'AM_I',
 'AND_BUT',
 'AND_END',
 'ANI_T',
 'ANTI_AMERICAN_HYPHEN',
 'ANYMORE_ADVERB',
 'ANY_WAY_TO_VB',
 'AN_ANOTHER',
 'AN_ARE',
 'AN_VB_PRP',
 'APPLE_PRODUCTS',
 'APRIL_FOOLS',
 'AREA_51',
 'ARE_WE_HAVE',
 'ARN_T',
 'ARRIVE_NNP',
 'ARTICLE_ADJECTIVE_OF',
 'ARTICLE_MISSING',
 'ARTICLE_UNNECESSARY

In [None]:
def check_spelling(text, verbose=True):
    """Collect LanguageTool's grammar and spelling corrections for ``text``.

    Only matches that offer at least one replacement are reported; the first
    replacement is used as the correction. A match is filed as a grammar error
    when its rule id is in ``language_tool_dict['GRAMMAR']['id']`` or equals
    ``'EN_A_VS_AN'``, otherwise as a spelling error.

    Args:
        text: The text to check.
        verbose: When true (the default, matching the previous behaviour) each
            chosen correction is printed. Pass ``False`` to run silently, e.g.
            when applying the function to many essays.

    Returns:
        Dict with keys ``'error_and_correct_grammar'`` and
        ``'error_and_correct_spelling'``; each maps
        ``(start_offset, end_offset)`` to ``(erroneous_text, correction)``.
    """
    error_and_correct_grammar = {}
    error_and_correct_spelling = {}
    for match in nlp_tool.check(text):
        if len(match.replacements) == 0:
            continue
        start = match.offset
        end = match.offset + match.errorLength
        correction = (text[start:end], match.replacements[0])
        if verbose:
            print(match.replacements[0])
        if match.ruleId in language_tool_dict['GRAMMAR']['id'] or match.ruleId == 'EN_A_VS_AN':
            error_and_correct_grammar[(start, end)] = correction
        else:
            error_and_correct_spelling[(start, end)] = correction
    return {
        "error_and_correct_grammar": error_and_correct_grammar,
        "error_and_correct_spelling": error_and_correct_spelling
        }

In [None]:
question_respond = "I is learning Englih"

In [None]:
check_spelling(question_respond)

am
English


{'error_and_correct_grammar': {(2, 4): ('is', 'am')},
 'error_and_correct_spelling': {(14, 20): ('Englih', 'English')}}

### CEFR Score

In [None]:
import trankit
nlp_parser = trankit.Pipeline(lang="english", gpu=True)

def nlp_sentence_parser(text):
    """Run the module-level ``nlp_parser`` pipeline on ``text`` and flatten it.

    Note: this redefinition carries the token *lemma* in the fifth tuple
    position, whereas the earlier definition in this notebook carried the
    token's ``head``. Code that unpacks the fifth element as a head index
    (``sentence_classifier``) must not be run against this version.

    Returns:
        A ``(sentences, nlp_sentences)`` pair of equal-length lists.
        ``sentences[i]`` is the raw text of the i-th sentence and
        ``nlp_sentences[i]`` is a list with one
        ``(text, xpos, upos, deprel, lemma)`` tuple per token of that sentence.
    """
    sentences = []
    nlp_sentences = []
    for parsed in nlp_parser(text)['sentences']:
        token_rows = []
        for token in parsed['tokens']:
            token_rows.append((token['text'], token['xpos'], token['upos'],
                               token['deprel'], token['lemma']))
        sentences.append(parsed['text'])
        nlp_sentences.append(token_rows)

    return sentences, nlp_sentences

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

http://nlp.uoregon.edu/download/trankit/v1.0.0/xlm-roberta-base/english.zip


Downloading: 100%|██████████| 47.9M/47.9M [00:04<00:00, 10.3MiB/s]


Loading pretrained XLM-Roberta, this may take a while...


Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Loading tokenizer for english
Loading tagger for english
Loading lemmatizer for english
Loading NER tagger for english
Active language: english


In [None]:
!gdown 1Ls0_r2OgVvMyng4HtFj22uxTly4kRJlv

Downloading...
From: https://drive.google.com/uc?id=1Ls0_r2OgVvMyng4HtFj22uxTly4kRJlv
To: /content/drive/MyDrive/IELTS_Score/vocab_cefr_en.csv
  0% 0.00/163k [00:00<?, ?B/s]100% 163k/163k [00:00<00:00, 26.5MB/s]


In [None]:
import pandas as pd
def build_cefr_dict(path):
    """Build a ``(word, pos) -> CEFR level`` lookup from ``vocab_cefr_en.csv``.

    Args:
        path: Directory that contains ``vocab_cefr_en.csv``; the file must
            provide ``word``, ``pos`` and ``cefr`` columns.

    Returns:
        Dict keyed by ``(word, pos)``. When a key occurs more than once, the
        last row wins.
    """
    vocabulary = pd.read_csv(path + "/vocab_cefr_en.csv")
    keys = zip(vocabulary['word'], vocabulary['pos'])
    return dict(zip(keys, vocabulary['cefr']))

In [None]:
# vocab_cefr_en.csv was downloaded into the Drive project folder (see the gdown
# output above), not into /content, which is why loading from '/content' failed.
cefr_dict = build_cefr_dict('/content/drive/MyDrive/IELTS_Score')

FileNotFoundError: ignored

In [None]:
cefr_dict

NameError: ignored

In [None]:
def cefr_vocab(text):
    """Group the vocabulary of ``text`` by CEFR level.

    Every token of every sentence returned by ``nlp_sentence_parser`` is looked
    up in ``cefr_dict`` by ``(lemma, upos)``. This requires the parser variant
    that stores the lemma in the fifth tuple position. Previously only the
    first sentence was inspected and empty input raised ``IndexError``.

    Args:
        text: Text to analyse; may contain any number of sentences.

    Returns:
        Dict mapping ``'A1'`` ... ``'C2'`` to the list of distinct lemmas found
        at that level, in order of first appearance.
    """
    _, parsed_sentences = nlp_sentence_parser(text)
    list_cefr = {'A1': [], 'A2': [], 'B1': [], 'B2': [], 'C1': [], 'C2': []}
    for tokens in parsed_sentences:
        for token in tokens:
            key = (token[4], token[2])  # (lemma, universal POS tag)
            if key not in cefr_dict:
                continue
            # Ignore dictionary entries whose level is not one of the six buckets.
            bucket = list_cefr.get(cefr_dict[key])
            if bucket is not None and key[0] not in bucket:
                bucket.append(key[0])
    return list_cefr

In [None]:
cefr_vocab("people's voting habits are influenced by political, social and economic factors")

### Synonym words, antonym words

In [None]:
!pip install fasttext

In [None]:
import fasttext
from nltk import everygrams
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
!gdown 1LxvvZkS6mUavbltHHEfxp8mZQYaiBNnO # Syn, ant dictionary
!gdown 1MDrD4qSWNG51qU__kxuCXpWamzqInRBk # fasttext

In [None]:
def build_syn_ant_dict(path):
    """Build a ``word -> (synonyms, antonyms)`` lookup from ``syn_ant_dic.csv``.

    Args:
        path: Directory that contains ``syn_ant_dic.csv``; the file must
            provide ``word``, ``synonyms`` and ``antonyms`` columns, the latter
            two holding comma-separated lists.

    Returns:
        Dict mapping each word to a ``(synonym_list, antonym_list)`` tuple of
        stripped strings. A missing cell yields an empty list (previously it
        yielded ``['']``), and blank items between commas are dropped.
    """
    def _split_cell(cell):
        # A missing cell is read by pandas as NaN.
        if pd.isna(cell):
            return []
        return [item.strip() for item in str(cell).split(",") if item.strip()]

    table = pd.read_csv(path + "/syn_ant_dic.csv")
    return {
        word: (_split_cell(synonyms), _split_cell(antonyms))
        for word, synonyms, antonyms in zip(table['word'], table['synonyms'], table['antonyms'])
    }

In [None]:
fasttext_model = fasttext.load_model('/content/cc.en.300.bin')
syn_ant_dict = build_syn_ant_dict('/content')

In [None]:
syn_ant_dict['comfortable'][0][:5], syn_ant_dict['comfortable'][1][:5]

In [None]:
def _collect_related_pairs(word_list, vocabulary, relation_index, threshold):
    """Return the ``(word, related)`` pairs of one relation found in ``word_list``.

    ``relation_index`` selects the list inside each ``syn_ant_dict`` entry
    (0 = synonyms, 1 = antonyms). A pair is kept when both phrases occur in the
    text, neither contains the other as a substring, the pair has not been
    recorded in either order, and the cosine similarity of their fastText
    vectors reaches ``threshold``.
    """
    pairs, recorded = [], set()
    for word in word_list:
        entry = syn_ant_dict.get(word)
        if entry is None:
            continue
        for related in entry[relation_index]:
            # Substring containment also rules out word == related.
            if related not in vocabulary or related in word or word in related:
                continue
            if (word, related) in recorded or (related, word) in recorded:
                continue
            vectors = (fasttext_model.get_word_vector(word),
                       fasttext_model.get_word_vector(related))
            # The embedding comparison is the expensive step, so it runs last.
            if cosine_similarity(vectors)[1][0].item() >= threshold:
                pairs.append((word, related))
                recorded.add((word, related))
    return pairs


def find_syn_ant(text, threshold_syn=0.5, threshold_ant=0.5):
    """Find synonym and antonym pairs that both occur in ``text``.

    All 1- to 8-grams of the whitespace-split text are looked up in
    ``syn_ant_dict``; a listed synonym / antonym counts when it also occurs in
    the text and its fastText vector is similar enough to the source phrase.

    Args:
        text: Text to search.
        threshold_syn: Minimum cosine similarity for a synonym pair.
        threshold_ant: Minimum cosine similarity for an antonym pair.

    Returns:
        ``(syn_list, ant_list)`` -- two lists of ``(phrase, related_phrase)``
        tuples in order of discovery; each unordered pair is reported once.
    """
    ngram_list = list(everygrams(text.split(), min_len=1, max_len=8))
    word_list = [" ".join(ngram) for ngram in ngram_list]
    # Set membership replaces repeated linear scans over the n-gram list.
    vocabulary = set(word_list)

    syn_list = _collect_related_pairs(word_list, vocabulary, 0, threshold_syn)
    ant_list = _collect_related_pairs(word_list, vocabulary, 1, threshold_ant)
    return syn_list, ant_list

In [None]:
# Example sentences for the synonym / antonym search. Each sentence is its own
# list item; a missing comma previously fused the last two into one string.
ant_syn = ['We did not expect such a large number of people to attend the concert.',
           'He has a big house in California.',
           'She has a small apartment in Manhattan.'
]

In [None]:
find_syn_ant(" ".join(ant_syn))