In [1]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [3]:
!pip install pandas openpyxl



In [8]:
# Import the necessary libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from spellchecker import SpellChecker
import re
import pandas as pd
import os

# Load the pretrained sequence-classification model and its matching tokenizer
# from Hugging Face. Both are module-level globals used by check_grammar().
model_name = "textattack/roberta-base-CoLA"  # RoBERTa-base checkpoint; the name indicates CoLA (linguistic acceptability) fine-tuning, used here as a grammar check
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def check_grammar(text):
    """Return the model's probability that ``text`` is grammatically correct.

    The text is tokenized with truncation enabled, passed through the
    module-level classifier without gradient tracking, and the softmax
    probability of class index 1 is returned as a plain Python float.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = logits.softmax(dim=-1)
    # Column 1 is read as the "grammatically correct" class; .item() only
    # works when exactly one text was scored.
    return class_probs[:, 1].item()

def preprocess_text(text):
    """Replace every punctuation character in ``text`` with a single space.

    Any character that is neither a word character (letter, digit,
    underscore) nor whitespace is substituted one-for-one, so neighbouring
    words are never glued together.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub(' ', text)

def check_spelling(text):
    """Find words the spell checker does not know and suggest corrections.

    Punctuation is stripped with ``preprocess_text`` before the text is split
    on whitespace and handed to the spell checker.

    Args:
        text: The text to check.

    Returns:
        dict: Maps each unknown word (as reported by ``SpellChecker.unknown``)
        to a set of candidate corrections. The set is empty when the checker
        has no suggestion.
    """
    # Creating a SpellChecker loads its dictionary, so build it once and reuse
    # it across calls instead of paying that cost for every transcript.
    spell = getattr(check_spelling, "_spell", None)
    if spell is None:
        spell = check_spelling._spell = SpellChecker()

    words = preprocess_text(text).split()

    # candidates() can return None when nothing is found; normalise that to an
    # empty set so callers can always iterate over the suggestions.
    return {word: spell.candidates(word) or set() for word in spell.unknown(words)}

def calculate_score(grammar_prob, misspelled_words_count):
    """Turn a grammar probability and a misspelling count into marks.

    Grammar contributes ``grammar_prob * 8`` marks. Spelling contributes
    2 marks for fewer than 10 misspelled words, 1 mark for 10 to 20, and
    0 marks above 20.

    Returns:
        tuple: ``(total_score, grammar_score, spelling_score)``, each rounded
        to two decimal places.
    """
    grammar_score = grammar_prob * 8

    # Spelling bands, checked from the fewest mistakes upwards.
    if misspelled_words_count < 10:
        spelling_score = 2
    elif misspelled_words_count <= 20:
        spelling_score = 1
    else:
        spelling_score = 0

    parts = (grammar_score + spelling_score, grammar_score, spelling_score)
    return tuple(round(value, 2) for value in parts)

def evaluate_text(text):
    """Score ``text`` on grammar and spelling and return the total score.

    The grammar probability comes from ``check_grammar`` and the number of
    distinct misspelled words from ``check_spelling``. Both are passed to
    ``calculate_score``, and only the total (first element) is returned.
    """
    grammar_prob = check_grammar(text)
    misspelled = check_spelling(text)

    # calculate_score also returns the grammar and spelling parts; only the
    # combined total is needed here.
    total_score, _, _ = calculate_score(grammar_prob, len(misspelled))
    return total_score

def evaluate_transcripts(file_path):
    """Score every transcript in a CSV/Excel file and write the scores back.

    The file is read into a DataFrame, each value of the 'Transcript' column
    is scored with ``evaluate_text``, the results are stored in a float
    'Score' column (created if missing, replaced if present), and the
    DataFrame is saved over the original file.

    Rows whose transcript is missing or contains only whitespace are not sent
    to the model; their score is left as NaN (a blank cell in the saved file).

    Args:
        file_path: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file that has a
            'Transcript' column.

    Raises:
        ValueError: If the extension is unsupported or the 'Transcript'
            column is absent.
    """
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == '.csv':
        df = pd.read_csv(file_path)
    elif file_ext in ('.xls', '.xlsx'):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")

    if 'Transcript' not in df.columns:
        raise ValueError("The file must contain a 'Transcript' column")

    # Empty spreadsheet cells arrive as NaN, which the tokenizer cannot take;
    # skip those rows instead of aborting the whole run.
    scores = []
    for transcript in df['Transcript']:
        if pd.isna(transcript) or not str(transcript).strip():
            scores.append(float('nan'))
        else:
            scores.append(float(evaluate_text(str(transcript))))

    # Assign the column in one step as float64. Seeding it with the int 0 and
    # writing floats cell by cell relies on implicit upcasting, which newer
    # pandas versions deprecate.
    df['Score'] = pd.Series(scores, index=df.index, dtype='float64')

    # Save the updated DataFrame back to the same file.
    # NOTE(review): writing legacy '.xls' needs an Excel writer engine that
    # recent pandas releases may no longer provide - confirm before relying on it.
    if file_ext == '.csv':
        df.to_csv(file_path, index=False)
    else:
        df.to_excel(file_path, index=False)
    print(f"Scores updated in {file_path}")

# Example usage: score the transcripts in this workbook. evaluate_transcripts
# saves its results back to the same path, so the file is overwritten in place.
file_path = '/content/transcripts.xlsx'  # Path to your file
evaluate_transcripts(file_path)


Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Scores updated in /content/transcripts.xlsx
