In [None]:
!pip install simpletransformers

In [None]:
!pip install torch

In [3]:
import logging
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
import string
from sklearn.model_selection import train_test_split

In [4]:
# Emit INFO-level records from the root logger, but raise the threshold of the
# "transformers" logger to WARNING so its output stays short.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Read the training CSV and discard its 'Context' column in one expression,
# then display the resulting frame.
df = pd.read_csv("train.csv").drop(columns=['Context'])
df

In [6]:
# Remove rows with missing values first, then exact duplicate rows.
df = df.dropna().drop_duplicates()

In [7]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translate table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
# Pair every response with a punctuation-free copy of itself: the stripped
# text is the model input, the untouched response is the target.
response_texts = df['Response'].tolist()
new_data = {
    'input_text': [remove_punctuation(response) for response in response_texts],
    'target_text': response_texts,
}

new_df = pd.DataFrame(new_data)

In [None]:
# Display the input/target pairs.
new_df

In [9]:
# new_df is created with 'input_text' / 'target_text' columns already, so no
# renaming is required (a rename from 'train_X' / 'train_y' would be a silent
# no-op because those columns never exist).
# Hold out 10% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every reported metric, reproducible across kernel restarts.
train_df, eval_df = train_test_split(new_df, test_size=0.1, random_state=42)

In [10]:
# Options handed to the Seq2SeqModel created below.
model_args = Seq2SeqArgs()
model_args.num_train_epochs = 1  # one pass over the training data
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.overwrite_output_dir=True  # presumably lets a re-run reuse an existing output directory — confirm
model_args.max_length=300  # presumably a sequence-length cap — confirm the exact meaning in the simpletransformers docs
model_args.task_specific_params=None

In [None]:
# Create a BART encoder-decoder from the "facebook/bart-large" checkpoint name,
# configured with the options set on model_args.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,

)

In [None]:
# Train on the training split; the held-out split is supplied as eval_data.
model.train_model(train_df, eval_data=eval_df)

In [None]:
# Evaluate the trained model on the held-out split.
result = model.eval_model(eval_df)

In [None]:
# Generate outputs for the punctuation-free inputs of the held-out split.
eval_predict=model.predict(list(eval_df['input_text'].values))

In [None]:
# Display the held-out inputs ([:] selects the whole column).
eval_df['input_text'][:]

In [None]:
# Display only the first generated output.
eval_predict[:1]

In [17]:
# Load the separate test inputs.
test_df = pd.read_csv("testing.csv")

In [None]:
# Plain-text dump of the test frame.
print(test_df)

In [19]:
# Extract the 'Testing' column as a plain Python list.
to_test=test_df['Testing'].values.tolist()

In [None]:
# Run the model on the test inputs and display the returned value.
model.predict(to_test)

In [None]:
!pip install SpeechRecognition

In [23]:
import speech_recognition as sr

In [None]:
def audio_to_text(audio_file):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Args:
        audio_file: Audio source handed to ``sr.AudioFile``.

    Returns:
        Whatever ``recognize_google`` returns for the full recording.

    Recognition errors are not caught here; they propagate to the caller.
    """
    rec = sr.Recognizer()
    with sr.AudioFile(audio_file) as clip:
        # Read the entire clip, then recognise it before the file is closed.
        return rec.recognize_google(rec.record(clip))

In [25]:
def transcribe_audio(filename):
    """Transcribe an audio file, returning a message string if recognition fails.

    Args:
        filename: Audio source handed to ``sr.AudioFile``.

    Returns:
        The value of ``recognize_google`` on success; otherwise a fixed
        explanatory message for ``sr.UnknownValueError`` or a message that
        embeds the error for ``sr.RequestError``.

    NOTE(review): failure messages come back through the same return value as
    a transcript, so callers cannot tell the two apart.
    """
    recognizer = sr.Recognizer()

    # Load the complete recording from the file.
    with sr.AudioFile(filename) as audio_source:
        recording = recognizer.record(audio_source)

    try:
        transcript = recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand your audio"
    except sr.RequestError as err:
        return "Could not request results from Google Speech Recognition service; {0}".format(err)
    return transcript

In [26]:
# Transcribe the sample recording.
audio_file = "harvard.wav"
audio_text = transcribe_audio(audio_file)

In [None]:
# Run the model on the transcript (predict takes a list of inputs).
audio_predictions = model.predict([audio_text])

In [None]:
# Show the transcript next to the model output.
print("Audio Input Text:", audio_text)
print("Predictions:", audio_predictions)

In [34]:
def capitalize_sentences(text):
    """Upper-case the first character of every '. '-separated sentence.

    Only the first character of each sentence is changed. The remainder is
    left untouched so proper nouns, acronyms and "I" keep their casing
    (``str.capitalize`` would lower-case everything after the first letter).

    Args:
        text: The text to process.

    Returns:
        ``text`` with each sentence start upper-cased; an empty string is
        returned unchanged.
    """
    sentences = text.split('. ')
    # s[:1] is '' for an empty piece, so empty sentences are handled safely.
    return '. '.join(s[:1].upper() + s[1:] for s in sentences)

In [None]:
# NOTE(review): this transcribes the file a second time, and the next cell
# overwrites audio_text_capitalize with a value derived from audio_text, so
# the result of this call is never used.
audio_text_capitalize = transcribe_audio(audio_file)

In [None]:
# Capitalize sentence starts of the transcript, then run the model on it.
audio_text_capitalize = capitalize_sentences(audio_text)  # Capitalize sentences
audio_predictions_capitalize = model.predict([audio_text_capitalize])

In [None]:
# Show the capitalized transcript next to the model output for it.
print("Audio Input Text:", audio_text_capitalize)
print("Predictions:", audio_predictions_capitalize)

In [None]:
import textstat

# Function to measure readability
def measure_readability(text):
    """Score ``text`` with two textstat readability metrics.

    Args:
        text: The text passed to both textstat functions.

    Returns:
        A ``(flesch_reading_ease, flesch_kincaid_grade)`` tuple.
    """
    return (
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
    )

In [None]:
# Measure readability of the model output for the capitalized transcript.
# model.predict returns a list of strings while textstat scores one string,
# so the predictions are joined into a single text first.
flesch_score, grade_level = measure_readability(" ".join(audio_predictions_capitalize))

# Print results
print("Flesch Reading Ease Score:", flesch_score)
print("Flesch-Kincaid Grade Level:", grade_level)

In [None]:
!pip install datasets

!pip install simpletransformers

!pip install tensorflow -U

import pandas as pd
from transformers import (AutoConfig, AutoModel, AutoTokenizer)
from nltk.tokenize import sent_tokenize
import nltk
from datasets import Dataset, ReadInstruction, load_dataset
import os
import tensorflow as tf
from simpletransformers.ner import NERModel, NERArgs


# Download the NLTK 'punkt' resource (sent_tokenize is used later in this notebook).
nltk.download('punkt')


# Report whether TensorFlow can see a GPU.
# NOTE(review): the NERModel further down is created with use_cuda=False, so
# this TensorFlow GPU configuration presumably has no effect on its training
# — confirm whether it is still needed.
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("No GPU available. Switch to Runtime -> Change runtime type -> GPU in Google Colab.")

#config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
#config.gpu_options.per_process_gpu_memory_fraction = 0.9  # Adjust as needed
#tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

# Turn on memory growth for every physical GPU TensorFlow lists.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# from google.colab import drive
# drive.mount('/content/drive')

# Load the training CSV, discard 'Context', then remove incomplete rows and
# exact duplicates, all as one chained expression.
df = (
    pd.read_csv("train.csv")
    .drop(columns=['Context'])
    .dropna()
    .drop_duplicates()
)

# Summary statistics and column/dtype overview of the cleaned frame.
print(df.describe())
df.info()

# Tokenizer for the "bert-base-uncased" checkpoint; descrete_and_label uses it
# to count the tokens of each line.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# The 'Response' column is the only text used from here on.
responses = df['Response']
responses

def chunk_sentences(text, chunk_size=10, stride=6):
    """Split ``text`` into overlapping chunks of consecutive sentences.

    Sentences come from ``sent_tokenize``. A new chunk starts every ``stride``
    sentences and contains up to ``chunk_size`` sentences, so with the defaults
    (10 and 6) neighbouring chunks share up to four sentences.

    Args:
        text: The text to split.
        chunk_size: Maximum number of sentences per chunk (default 10).
        stride: Number of sentences between chunk starts (default 6).

    Returns:
        A list of strings, each the space-joined sentences of one chunk;
        empty when ``sent_tokenize`` yields no sentences.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is smaller than 1.
    """
    if chunk_size < 1 or stride < 1:
        raise ValueError("chunk_size and stride must both be >= 1")
    sentences = sent_tokenize(text)
    return [
        ' '.join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), stride)
    ]

# Chunk every response, then flatten the per-response lists into one Series
# holding a single chunk per row.
chunks_list = responses.apply(chunk_sentences).tolist()
resulting_data = []
for response_chunks in chunks_list:
    resulting_data.extend(response_chunks)
resulting_series = pd.Series(resulting_data)

resulting_series

import pandas as pd

def add_space_before_punctuation(text):
    """Insert a space in front of each punctuation mark not already preceded by one.

    The marks handled are ``. , ; : ! ? - ' ( )``. A mark at the very start of
    ``text`` also receives a leading space.

    Args:
        text: The string to process.

    Returns:
        A new string with the spaces inserted.
    """
    marks = frozenset(".,;:!?-'()")
    pieces = []
    previous = None  # last character emitted so far; None before the first one
    for ch in text:
        if ch in marks and previous != ' ':
            pieces.append(' ')
        pieces.append(ch)
        previous = ch
    return ''.join(pieces)

# Put a space in front of punctuation marks so that line.split() in
# descrete_and_label can see them as separate tokens.
processed_data_series = resulting_series.apply(add_space_before_punctuation)

#print(processed_data_series)

processed_data_series

# Wrap the chunks in a one-column ('text') frame and convert it to a Dataset.
new_df = processed_data_series.to_frame(name='text')
punc_dataset = Dataset.from_pandas(new_df)
print(type(punc_dataset))

# Spot-check one record.
print(punc_dataset[2])

# Map from a punctuation mark to the label given to the word it follows
# (see descrete_and_label); any other word gets normal_label.
special_labels = {',': 'I-COMMA',
                  '.': 'I-DOT',
                  '?': 'I-QMARK',
                  '!': 'I-EMARK',
                  ':': 'I-COLON',
                  ';': 'I-SEMICOLON',
                  '-': 'I-HYPHEN', # Label for the hyphen (new)
                  "'": 'I-APOST'  } # Label for the apostrophe (new)
# Label for a word that is not followed by one of the marks above.
normal_label = 'O'

def descrete_and_label(list_of_lines):
    """Turn text lines into ``[sentence_id, word, label]`` rows.

    Each line is split on whitespace. A token that is a key of
    ``special_labels`` is not emitted as a row; instead its label replaces the
    label of the previous word of the same line. Every other token becomes a
    row labelled ``normal_label``.

    A punctuation token with no preceding word in its own line (for example a
    line that starts with a mark) is kept as an ordinary row. It is never used
    to relabel the last word of an earlier line, which would corrupt that
    line's labels.

    Args:
        list_of_lines: Iterable of strings; the position of a line in it is
            used as that line's ``sentence_id``.

    Returns:
        A list of ``[sentence_id, word, label]`` lists. Lines whose tokenizer
        output has fewer than 10 or more than 510 tokens are skipped, so
        sentence ids may have gaps.
    """
    rows = []
    for line_idx, line in enumerate(list_of_lines):
        # Token-count filter; the 510 upper bound presumably leaves room for
        # special tokens within a 512-token model limit — confirm.
        n_tokens = len(tokenizer.tokenize(line))
        if n_tokens < 10 or n_tokens > 510:
            continue
        for word in line.split():
            mark_label = special_labels.get(word)
            # Relabel the previous word only if it belongs to this same line.
            if mark_label and rows and rows[-1][0] == line_idx:
                rows[-1][2] = mark_label
            else:
                rows.append([line_idx, word, normal_label])
    return rows

def save_dataset(ds, path):
    """Label the texts of ``ds`` and write the rows to a CSV file.

    Args:
        ds: Iterable of records exposing a ``'text'`` entry; records whose
            text has 20 characters or fewer are ignored.
        path: Destination of the CSV, written without the index and with the
            columns ``sentence_id``, ``words`` and ``labels``.
    """
    long_texts = [record['text'] for record in ds if len(record['text']) > 20]
    labelled_rows = descrete_and_label(long_texts)
    frame = pd.DataFrame(labelled_rows, columns=["sentence_id", "words", "labels"])
    frame.to_csv(path, index=False)

# Create the output folder for the per-bin CSV files if it is missing.
directory = 'processed_punc'
if not os.path.exists(directory):
    os.makedirs(directory)

# Split the dataset into `binz` shards and save each one as a labelled CSV.
binz = 10
pct_per_bin = 100/binz
for i in range(binz):
    # The percentages are only printed as progress; shard() decides the rows.
    start_pct, end_pct = int(i * pct_per_bin), int((i+1) * pct_per_bin)
    print(start_pct, end_pct)
    sub_dataset = punc_dataset.shard(num_shards=binz, index=i)
    print(len(sub_dataset))
    save_dataset(sub_dataset, f'./processed_punc/train{i}-{binz}.csv')

# Spot-check one record of the last shard.
print(sub_dataset[5])

# Every label the model can emit: one per punctuation mark plus normal_label.
labels_set = list(special_labels.values()) + [normal_label]


# NER model built from the 'bert-base-uncased' checkpoint name with the label
# set above; use_cuda=False is passed explicitly.
model_args = NERArgs()
model_args.overwrite_output_dir = True
model_args.save_steps = -1  # presumably disables step-based checkpoint saving — confirm in the simpletransformers docs
model = NERModel('bert',
                 'bert-base-uncased',
                 labels = labels_set,
                 args=model_args,
                 use_cuda=False)

binz = 10
# Train on bins 0 .. binz-2 in turn; the CSV of the last bin is not read here.
for i in range(binz-1):
    print(f'\t\t\t\t\t\tBin {i+1} out of {binz}')
    bin_df = pd.read_csv(f'./processed_punc/train{i}-{binz}.csv').dropna()
    model.train_model(bin_df, output_dir=f'./bert_models/{i}')
    # Drop the reference to this bin's frame before loading the next one.
    del bin_df

# Evaluate on the CSV of bin 9.
testdf = pd.read_csv('./processed_punc/train9-10.csv').dropna()


result, model_outputs, predictions = model.eval_model(testdf)
# One frame per sentence_id; printed check that there is one per prediction.
groups = [sentence_rows for _, sentence_rows in testdf.groupby('sentence_id')]
print(len(predictions) == len(groups))


# Reference labels per sentence, cut to the length of the matching prediction.
trues = [
    groups[idx]['labels'].to_list()[:len(predicted)]
    for idx, predicted in enumerate(predictions)
]


print(len(trues) == len(predictions))


# Hand-written, unpunctuated sample used to try the trained model.
MY_TEST_TEXTS = ["There is no wrong or right way to define a relationship I believe each relationship we are in is an opportunity to expand and to know self on a deeper level We are conditioned to believe that we are not valued or worth much without the confirmation of others and the world around us Give yourself time and try to not go to those who are toxic and enjoy the drama of others lives as s way to avoid looking at themselves"]

# Delete every mark that has a label in special_labels from the sample texts.
removed_punctuations = []
for txt in MY_TEST_TEXTS:
    stripped = txt
    for mark in special_labels:
        stripped = stripped.replace(mark, "")
    removed_punctuations.append(stripped)

removed_punctuations


result, _ = model.predict(removed_punctuations)


# Rebuild text from the predictions: emit each word, followed by the
# punctuation mark of its label when the label has one.
inv_special_labels = {label: mark for mark, label in special_labels.items()}
predicted_texts = []
for res in result:
    pieces = []
    for diction in res:
        # The first (word, label) pair of the mapping is the one used.
        word, label = next(iter(diction.items()))
        pieces.append(word)
        if label in inv_special_labels:
            pieces.append(inv_special_labels[label])
    predicted_texts.append(" ".join(pieces))

predicted_texts




!pip install SpeechRecognition

import speech_recognition as sr
import string

# Function to convert audio input to text
def audio_to_text(audio_file):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Args:
        audio_file: Audio source handed to ``sr.AudioFile``.

    Returns:
        Whatever ``recognize_google`` returns for the full recording.

    Recognition errors are not caught here; they propagate to the caller.
    """
    rec = sr.Recognizer()
    with sr.AudioFile(audio_file) as clip:
        # Read the entire clip, then recognise it before the file is closed.
        return rec.recognize_google(rec.record(clip))

# Transcribe the sample recording.
audio_file = "harvard.wav"
audio_text = audio_to_text(audio_file)

# Run the punctuation model on the transcript (predict takes a list of texts).
result, _ = model.predict([audio_text])

# Rebuild text from the predictions: emit each word, followed by the
# punctuation mark of its label when the label has one.
inv_special_labels = {label: mark for mark, label in special_labels.items()}
predicted_texts = []
for res in result:
    pieces = []
    for diction in res:
        # The first (word, label) pair of the mapping is the one used.
        word, label = next(iter(diction.items()))
        pieces.append(word)
        if label in inv_special_labels:
            pieces.append(inv_special_labels[label])
    predicted_texts.append(" ".join(pieces))

predicted_texts

import re

def capitalize_text(text):
    """Capitalize the first word character of the text and of each sentence.

    A sentence start is a word character at the very beginning of ``text`` or
    directly after a '.', '!' or '?' followed by one space. All three
    terminators are handled because the punctuation model can predict each of
    them, not only the full stop.

    Args:
        text: The text to process.

    Returns:
        ``text`` with the matched characters capitalized; everything else is
        left unchanged.
    """
    return re.sub(r"(?:^|(?<=[.!?] ))\w", lambda m: m.group().capitalize(), text)

# predicted_texts already holds the punctuated transcript rebuilt from
# model.predict([audio_text]) earlier, so it is reused here instead of running
# the same inference and reconstruction a second time.

# Capitalize the predicted text
capitalized_texts = [capitalize_text(text) for text in predicted_texts]

capitalized_texts

import textstat

# Function to measure readability
def measure_readability(text):
    """Score ``text`` with two textstat readability metrics.

    Args:
        text: The text passed to both textstat functions.

    Returns:
        A ``(flesch_reading_ease, flesch_kincaid_grade)`` tuple.
    """
    return (
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
    )

# Measure readability of the punctuated, capitalized transcript.
# capitalized_texts is a list of strings while textstat scores one string,
# so the texts are joined into a single text first.
flesch_score, grade_level = measure_readability(" ".join(capitalized_texts))

# Print results
print("Flesch Reading Ease Score:", flesch_score)
print("Flesch-Kincaid Grade Level:", grade_level)

In [None]:
!pip install simpletransformers

!pip install torch

import logging
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
import string
from sklearn.model_selection import train_test_split

# Emit INFO-level records from the root logger, but raise the threshold of the
# "transformers" logger to WARNING so its output stays short.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

#from google.colab import drive
#drive.mount('/content/drive')

# Load the training CSV, discard 'Context', then remove incomplete rows and
# exact duplicates, all as one chained expression.
df = (
    pd.read_csv("train.csv")
    .drop(columns=['Context'])
    .dropna()
    .drop_duplicates()
)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translate table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
# Pair every response with a punctuation-free copy of itself: the stripped
# text is the model input, the untouched response is the target.
response_texts = df['Response'].tolist()
new_data = {
    'input_text': [remove_punctuation(response) for response in response_texts],
    'target_text': response_texts,
}

new_df = pd.DataFrame(new_data)

new_df

# new_df is created with 'input_text' / 'target_text' columns already, so no
# renaming is required (a rename from 'train_X' / 'train_y' would be a silent
# no-op because those columns never exist).
# Hold out 10% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every reported metric, reproducible across kernel restarts.
train_df, eval_df = train_test_split(new_df, test_size=0.1, random_state=42)

# Options handed to the Seq2SeqModel created below.
model_args = Seq2SeqArgs()
model_args.num_train_epochs = 1  # one pass over the training data
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.overwrite_output_dir=True  # presumably lets a re-run reuse an existing output directory — confirm
model_args.max_length=300  # presumably a sequence-length cap — confirm the exact meaning in the simpletransformers docs
model_args.task_specific_params=None

# Create a BART encoder-decoder from the "facebook/bart-large" checkpoint name,
# configured with the options set on model_args.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,

)

# Train on the training split; the held-out split is supplied as eval_data.
model.train_model(train_df, eval_data=eval_df)

# Evaluate the trained model on the held-out split.
result = model.eval_model(eval_df)

# Generate outputs for the punctuation-free inputs of the held-out split.
eval_predict=model.predict(list(eval_df['input_text'].values))

# Bare expressions: these only display something when they end a cell.
eval_df['input_text'][:]

eval_predict[:1]

# Load the separate test inputs.
test_df = pd.read_csv("testing.csv")

print(test_df)

# Extract the 'Testing' column as a plain Python list.
to_test=test_df['Testing'].values.tolist()

# NOTE(review): the return value is not assigned, so unless this line ends a
# cell the predictions are discarded.
model.predict(to_test)

!pip install SpeechRecognition

import speech_recognition as sr

def audio_to_text(audio_file):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Args:
        audio_file: Audio source handed to ``sr.AudioFile``.

    Returns:
        Whatever ``recognize_google`` returns for the full recording.

    Recognition errors are not caught here; they propagate to the caller.
    """
    rec = sr.Recognizer()
    with sr.AudioFile(audio_file) as clip:
        # Read the entire clip, then recognise it before the file is closed.
        return rec.recognize_google(rec.record(clip))

def transcribe_audio(filename):
    """Transcribe an audio file, returning a message string if recognition fails.

    Args:
        filename: Audio source handed to ``sr.AudioFile``.

    Returns:
        The value of ``recognize_google`` on success; otherwise a fixed
        explanatory message for ``sr.UnknownValueError`` or a message that
        embeds the error for ``sr.RequestError``.

    NOTE(review): failure messages come back through the same return value as
    a transcript, so callers cannot tell the two apart.
    """
    recognizer = sr.Recognizer()

    # Load the complete recording from the file.
    with sr.AudioFile(filename) as audio_source:
        recording = recognizer.record(audio_source)

    try:
        transcript = recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand your audio"
    except sr.RequestError as err:
        return "Could not request results from Google Speech Recognition service; {0}".format(err)
    return transcript

# Transcribe the sample recording.
audio_file = "harvard.wav"
audio_text = transcribe_audio(audio_file)

# Run the model on the transcript (predict takes a list of inputs).
audio_predictions = model.predict([audio_text])

# Show the transcript next to the model output.
print("Audio Input Text:", audio_text)
print("Predictions:", audio_predictions)

def capitalize_sentences(text):
    """Upper-case the first character of every '. '-separated sentence.

    Only the first character of each sentence is changed. The remainder is
    left untouched so proper nouns, acronyms and "I" keep their casing
    (``str.capitalize`` would lower-case everything after the first letter).

    Args:
        text: The text to process.

    Returns:
        ``text`` with each sentence start upper-cased; an empty string is
        returned unchanged.
    """
    sentences = text.split('. ')
    # s[:1] is '' for an empty piece, so empty sentences are handled safely.
    return '. '.join(s[:1].upper() + s[1:] for s in sentences)

# Capitalize the transcript that is already in audio_text. Transcribing the
# file again here would only cost a second recognition request, because its
# result would be overwritten immediately.
audio_text_capitalize = capitalize_sentences(audio_text)  # Capitalize sentences
audio_predictions_capitalize = model.predict([audio_text_capitalize])

# Show the capitalized transcript next to the model output for it.
print("Audio Input Text:", audio_text_capitalize)
print("Predictions:", audio_predictions_capitalize)

import textstat

# Function to measure readability
def measure_readability(text):
    """Score ``text`` with two textstat readability metrics.

    Args:
        text: The text passed to both textstat functions.

    Returns:
        A ``(flesch_reading_ease, flesch_kincaid_grade)`` tuple.
    """
    return (
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
    )

# Measure readability of the model output for the capitalized transcript.
# The variable is audio_predictions_capitalize (without a trailing "d").
# model.predict returns a list of strings while textstat scores one string,
# so the predictions are joined into a single text first.
flesch_score, grade_level = measure_readability(" ".join(audio_predictions_capitalize))

# Print results
print("Flesch Reading Ease Score:", flesch_score)
print("Flesch-Kincaid Grade Level:", grade_level)