# seed

In [2]:
import random


def reset_numpy_seed(seed_value=42):
  """Seed NumPy's global random generator and print the outcome.

  NumPy is imported lazily inside the function, so a failing import is
  reported on stdout instead of being raised.

  Args:
    seed_value: Seed handed to ``numpy.random.seed``.
  """
  try:
    import numpy
    numpy.random.seed(seed_value)
    status = f'NumPy random seed set with value: {seed_value}'
  except Exception as err:
    status = f'NumPy random seed was not set: {err}'
  print(status)


def reset_tensorflow_seed(seed_value=42):
  """Seed TensorFlow's global random generator and print the outcome.

  The seeding function is looked up once: ``tf.random.set_seed`` is
  preferred, and ``tf.set_random_seed`` is used only when the former is
  missing. Exactly one of them is called, and any error it raises is
  reported together with its message instead of being silently dropped.

  Args:
    seed_value: Seed handed to the TensorFlow seeding function.
  """
  try:
    import tensorflow as tf
    # The API location depends on the installed TensorFlow version:
    # newer releases expose tf.random.set_seed, older ones tf.set_random_seed.
    set_seed = getattr(getattr(tf, 'random', None), 'set_seed', None)
    if set_seed is None:
      set_seed = getattr(tf, 'set_random_seed', None)
    if set_seed is None:
      print('TensorFlow random seed was not set')
      return
    set_seed(seed_value)
    print(f'TensorFlow random seed set with value: {seed_value}')
  except Exception as e:
    # Covers a missing TensorFlow install as well as a failing seed call.
    print(f'TensorFlow random seed was not set: {e}')
  return


def reset_torch_seed(seed_value=42):
  """Seed PyTorch (CPU, and CUDA when available) and print the outcome.

  PyTorch is imported lazily inside the function, so a failing import is
  reported on stdout instead of being raised.

  Args:
    seed_value: Seed handed to the ``torch`` seeding functions.
  """
  try:
    import torch
    torch.manual_seed(seed_value)
    cuda = torch.cuda
    if cuda.is_available():
      # Current device first, then every visible GPU.
      for seed_fn in (cuda.manual_seed, cuda.manual_seed_all):
        seed_fn(seed_value)
    status = f'PyTorch random seed set with value: {seed_value}'
  except Exception as err:
    status = f'PyTorch random seed was not set: {err}'
  print(status)


def set_random_seeds(seed_value=42):
  """Seed Python's ``random`` module, then NumPy, TensorFlow and PyTorch.

  Args:
    seed_value: Seed forwarded unchanged to every generator.
  """
  random.seed(seed_value)
  # Library-specific helpers, called in the same order as before.
  for reseed in (reset_numpy_seed, reset_tensorflow_seed, reset_torch_seed):
    reseed(seed_value)


if __name__ == '__main__':
  # Seed shared by every random number generator in this run.
  seed = 7757

  # Apply it to Python, NumPy, TensorFlow and PyTorch.
  set_random_seeds(seed_value=seed)

NumPy random seed set with value: 7757
TensorFlow random seed set with value: 7757
PyTorch random seed set with value: 7757


# intro

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [4]:
# Mount Google Drive at /content/drive; the CSV files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Training set (columns used later: 'title', 'content', 'class').
read_data = pd.read_csv("/content/drive/MyDrive/NLP3/train.csv")


In [17]:
# Load the test file and add a constant True 'class' column, one value per title.
val_data = pd.read_csv('/content/drive/MyDrive/NLP3/test.csv')
val_data['class'] = [True] * len(val_data['title'])

print(len(val_data))

36669


In [7]:
# class Stire:
#   def __init__(self, title, content, satire):
#     self.title = title
#     self.content = content
#     self.satire = satire

# preprocesare

In [8]:
import nltk
from nltk.stem.snowball import stopwords
nltk.download('stopwords')
import re

def preprocess(text):
    """Clean a raw text string before feature extraction.

    Steps, in order: drop hyphens glued between two non-space characters,
    attach measure units / currency signs to the preceding number, turn a
    comma between two digits into a dot, blank out URLs, collapse
    whitespace runs, replace @-mentions with the placeholder ``Persoană``,
    normalise dashes and quotation marks, insert a space after ``. , ; :``
    when one is missing, squeeze repeated spaces and drop a leading space.
    Diacritics are left untouched.

    Args:
        text: The text to clean.

    Returns:
        The cleaned string (possibly empty), or ``None`` when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return None

    # Remove hyphens that sit between two non-space characters.
    text = re.sub(r'(?<=\S)-(?=\S)', '', text)
    # Glue a unit or currency sign to the number before it ("10 km" -> "10km").
    # The dollar sign is escaped: a bare `$` is the end-of-string anchor and
    # would never match a literal "$".
    text = re.sub(r'(\d+)(\s)(%|°|€|¥|£|\$|cm|mm|m|km|in|ft|yd|mi|g|kg|lb|oz)(?!\w)', r'\1\3', text)
    # A comma between two digits becomes a dot.
    text = re.sub(r'(\d),(\d)', r'\1.\2', text)

    # URLs, together with optional surrounding parentheses.
    text = re.sub(r'\(?\s*https?\S*\s*\)?', r' ', text)

    # Collapse every whitespace run (tabs and newlines included) to one space.
    text = re.sub(r'\s*\s', r' ', text)

    # @-mentions become a generic placeholder token.
    text = re.sub(r'\(?\s*@\S*\s*\)?', r' Persoană ', text)

    # Normalize dashes and quotation marks.
    for dash in ('–', '—', '−', '‑'):
        text = text.replace(dash, '-')
    for quote in ('“', '”', '„'):
        text = text.replace(quote, '"')

    # Make sure punctuation is followed by a space.
    # NOTE(review): this also splits numbers produced by the comma-to-dot step
    # ("3,5" -> "3.5" -> "3. 5") - confirm whether that is intended.
    text = re.sub(r'(?<=[.,;:])(?=[^\s])', r' ', text)

    # Reduce multiple spaces.
    text = re.sub(' +', ' ', text)

    # startswith() is safe on an empty string, unlike text[0].
    if text.startswith(' '):
        text = text[1:]

    return text

# TODO - patterns preprocess() does not handle yet:
# - phone numbers
# - hashtags
# - pic.twitter links
# - "++" and "--"


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# titlu + content

In [12]:
# Hold out 20% of the rows; titles, contents and labels are split together.
(title_train, title_test,
 content_train, content_test,
 satire_train, satire_test) = train_test_split(
    read_data['title'], read_data['content'], read_data['class'], test_size=0.20)


# One document per row: "<title> <content>".
train_data = [' '.join((str(title), str(content))) for title, content in zip(title_train, content_train)]
test_data = [' '.join((str(title), str(content))) for title, content in zip(title_test, content_test)]

# The TF-IDF vocabulary is learned on the training documents only.
vectorizer = TfidfVectorizer()
train_data_vectors = vectorizer.fit_transform(train_data)
test_data_vectors = vectorizer.transform(test_data)


In [None]:
# Linear-kernel SVM fitted on the training vectors (fit() returns the estimator).
clf_svm = svm.SVC(kernel='linear').fit(train_data_vectors, satire_train)

# Score the held-out documents.
predicted_labels = clf_svm.predict(test_data_vectors)
accuracy = accuracy_score(satire_test, predicted_labels)
print("Accuracy:", accuracy)

# titlu

In [18]:
# title_train, title_test, satire_train, satire_test = train_test_split(read_data['title'].apply(lambda text: preprocess(text)), read_data['class'], test_size=0.20)
# Train on every labelled title; predictions are made for the titles in val_data.
title_train = read_data['title']
satire_train = read_data['class']
title_val = val_data['title']

train_data = [str(title) for title in title_train]
# test_data = [str(title) for title in title_test]
# Kept under its own name: rebinding `val_data` to a list would make this cell
# fail when re-run, because `val_data['title']` needs the DataFrame.
val_titles = [str(title) for title in title_val]

vectorizer = TfidfVectorizer()
train_data_vectors = vectorizer.fit_transform(train_data)
# test_data_vectors = vectorizer.transform(test_data)
val_data_vectors = vectorizer.transform(val_titles)

In [19]:
# Fit a linear-kernel SVM on the title vectors (author's note: training took ~22 min).
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(X=train_data_vectors, y=satire_train)

In [None]:
# Predict on the held-out vectors and score against the held-out labels.
# NOTE(review): the title cell leaves its own split and `test_data_vectors`
# commented out, so these names come from whichever earlier split cell ran
# last - confirm they match the vectorizer `svm_classifier` was trained with.
predicted_labels_title = svm_classifier.predict(test_data_vectors)
accuracy = accuracy_score(y_true=satire_test, y_pred=predicted_labels_title)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of held-out labels vs. title-model predictions.
cm = confusion_matrix(satire_test, predicted_labels_title)

# Draw it as an annotated heatmap on an explicit Axes.
class_names = ['Non-satire', 'Satire']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()


In [20]:
# Label every title of the test file with the title-only classifier.
predicted_labels_title = svm_classifier.predict(X=val_data_vectors)

In [21]:
# Sanity check: number of predictions (the recorded run shows 36669, the same as len(val_data)).
len(predicted_labels_title)

36669

In [23]:
def code_to_text(code):
  """Map a binary class code to its boolean label.

  Args:
    code: 0/1, a bool, or any scalar accepted by ``int()`` (for example a
      NumPy boolean or integer scalar).

  Returns:
    ``False`` for code 0 and ``True`` for code 1.
  """
  labels = [False, True]
  # int() lets NumPy boolean scalars be used as a list index.
  return labels[int(code)]




# Flatten the classifier output into a flat list of boolean labels.
output = predicted_labels_title
y_pred = []
for item in output:
  try:
    # An entry may hold a pair of codes; scalars cannot be unpacked.
    item0, item1 = item
  except (TypeError, ValueError):
    y_pred.append(code_to_text(item))
  else:
    y_pred.extend([code_to_text(item0), code_to_text(item1)])
predict = y_pred

# Only the 'class' column is stored; to_csv() writes the positional index as
# the first, unnamed column. The builtin `dict` is no longer shadowed.
# NOTE(review): the header is ",class" - confirm whether the id column needs a name.
final = pd.DataFrame({'class': predict})
final.to_csv('final_labels2.csv')

  return labels[code]


# content

In [None]:
title_train, title_test, content_train, content_test, satire_train, satire_test = train_test_split(read_data['title'], read_data['content'], read_data['class'], test_size=0.20)

# Vectorize the content with TF-IDF. Values go through str() first, as in the
# title cells, so a non-string entry (presumably a missing value read as NaN -
# confirm against the data) does not make the vectorizer fail.
content_train_docs = [str(content) for content in content_train]
content_test_docs = [str(content) for content in content_test]

vectorizer = TfidfVectorizer()
train_data_vectors = vectorizer.fit_transform(content_train_docs)
test_data_vectors = vectorizer.transform(content_test_docs)


In [None]:
# Linear-kernel SVM fitted on the content vectors (fit() returns the estimator).
clf_svm = svm.SVC(kernel='linear').fit(train_data_vectors, satire_train)

# Score the held-out contents.
predicted_labels_content = clf_svm.predict(test_data_vectors)
accuracy = accuracy_score(satire_test, predicted_labels_content)
print("Accuracy:", accuracy)

# restul

In [None]:
# file_name = "/content/train_data.csv"

# reviews = pd.read_csv(file_name)

# text = reviews.Text
# trr = reviews["Final Labels"]
# trr = [str(x) for x in trr]

# reviews = []

# for i in range(len(trr)):
#   reviews.append(Review(text[i], trr[i]))

# reviews[5].text


'@ALETTAOCEANXXXX Ce prințesă frumoasă și sexxxxyyy ești'

In [None]:

# train, test = train_test_split(reviews, test_size = 0.2, random_state = 42)
# train = reviews
# train_x = [x.text for x in train]
# train_y = [x.sentiment for x in train]

# test_x = [x.text for x  in test]
# test_y = [x.sentiment for x in test]



In [18]:
# vectorizer = TfidfVectorizer()
# train_x_vectors = vectorizer.fit_transform(stire_train_df)
# test_x_vectors = vectorizer.transform(stire_test_df)
# tokenizer = AutoTokenizer.from_pretrained('readerbench/RoBERT-large')
# model = AutoModel.from_pretrained('readerbench/RoBERT-large')
# print(train_x_vectors[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/469 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/245k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

  (0, 2)	1.0


In [20]:
# clf_svm = svm.SVC(kernel  = 'linear')

# clf_svm.fit(train_x_vectors, train_y)

# clf_svm.predict(test_x_vectors[0])



NameError: name 'train_y' is not defined

In [None]:
# from sklearn import svm

# # Create SVM classifier with a linear kernel
# clf_svm = svm.SVC(kernel='linear')

# satire_list = [stire.satire for stire in stire_train_instances]
# # Fit the classifier using TF-IDF vectors of training data
# clf_svm.fit(train_x_vectors, satire_list)

# # Predict the class label for the first sample in the test set
# predicted_label = clf_svm.predict(test_x_vectors[0])

# print("Predicted label:", predicted_label)


In [None]:
# test_data = pd.read_csv('/content/test_data.csv')

# test_x = test_data.Text
# test_x = vectorizer.transform(test_x)





In [None]:
# predict = []

# predict = clf_svm.predict(test_x)

In [None]:
# i = range(len(predict))
# dict = {'Id':i, 'Label':predict}

# final = pd.DataFrame(dict)

# fc = final.columns[0]
# final = final.drop([fc], axis=1)
# # final.columns = ['Id', 'Label']
# final.to_csv('final.csv')

# #from sklearn.metrics import balanced_accuracy_score as bas
# #bas(test_y, clf_svm.predict(test_x_vectors))


In [None]:
# from sklearn.metrics import f1_score

# f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels = [Sentiment.Sexist_direct, Sentiment.Sexist_descriptiv, Sentiment.Sexist_reporting, Sentiment.Non_sexist_offensive, Sentiment.Non_sexist_non_offensive])


In [15]:
from sklearn.metrics import balanced_accuracy_score

def balanced_accuracy_evaluate(model, test_data):
    """Run ``eval_epoch`` for ``model`` and return its predictions and labels.

    Reads ``test_dataloader``, ``loss_crt`` and ``device`` from the notebook's
    global scope; ``test_data`` is accepted but not used.
    NOTE(review): despite the name, no balanced accuracy is computed here.

    Returns:
        ``(preds, labels)`` - the second and third values returned by
        ``eval_epoch``; the first one is discarded.
    """
    _loss, preds, labels = eval_epoch(model, test_dataloader, loss_crt, device)
    return preds, labels

In [16]:
# Wrap the test documents in a Dataset, batch them and collect the model's predictions.
# NOTE(review): `Dataset`, `BATCH_SIZE` and `model` are not defined in the
# visible cells (the recorded run stopped with NameError on `Dataset`) -
# confirm where they are supposed to come from.
test = Dataset(test_data)

test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

predict, labels = balanced_accuracy_evaluate(model, test)

print(len(predict))

NameError: name 'Dataset' is not defined

In [None]:
def code_to_text(code):
  """Map a binary class code to its boolean label.

  Args:
    code: 0/1, a bool, or any scalar accepted by ``int()`` (for example a
      NumPy boolean or integer scalar).

  Returns:
    ``False`` for code 0 and ``True`` for code 1.
  """
  labels = [False, True]
  # int() lets NumPy boolean scalars be used as a list index.
  return labels[int(code)]




# Flatten the model output into a flat list of boolean labels.
output = predict
y_pred = []
for item in output:
  try:
    # An entry may hold a pair of codes; scalars cannot be unpacked.
    item0, item1 = item
  except (TypeError, ValueError):
    y_pred.append(code_to_text(item))
  else:
    y_pred.extend([code_to_text(item0), code_to_text(item1)])
predict = y_pred

# Only the 'class' column is stored; to_csv() writes the positional index as
# the first, unnamed column. The builtin `dict` is no longer shadowed.
# NOTE(review): the header is ",class" - confirm whether the id column needs a name.
final = pd.DataFrame({'class': predict})
final.to_csv('final.csv')