# **Connect to Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Download data from csv file**

In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/data.csv',encoding='utf-8')
df.head()
print(df["content"])

0       Theodosius Atallah Hanna, the Archbishop of Se...
1       75 years of ongoing #EthnicCleansing of #Pales...
2       #CNN #msnbc #foxnews @CNN @MSNBC @FoxNews  #Is...
3       Get ready for Faris Ishaq’s latest single comi...
4       #Israeli Police Turns #Jerusalem into Military...
                              ...                        
6285    #Palestine | Occupation forces raid the villag...
6286    “You’re anti-semetic” okay and?!?!?🇵🇸 #Palesti...
6287    The children who died at the hands of #SouthAf...
6288    Resistance is NOT terrorism \n#Palestine  http...
6289    #Palestine | Occupation forces heavily fire te...
Name: content, Length: 6290, dtype: object


# **Remove punctuation**

In [3]:
import pandas as pd
import string

def remove_punctuation(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

text= df['content'].apply(remove_punctuation)
print(text)

0       Theodosius Atallah Hanna the Archbishop of Seb...
1        75 years of ongoing EthnicCleansing of Palestine
2       CNN msnbc foxnews CNN MSNBC FoxNews  Israel is...
3       Get ready for Faris Ishaq’s latest single comi...
4       Israeli Police Turns Jerusalem into Military B...
                              ...                        
6285    Palestine  Occupation forces raid the village ...
6286    “You’re antisemetic” okay and🇵🇸 Palestine jew ...
6287    The children who died at the hands of SouthAfr...
6288    Resistance is NOT terrorism \nPalestine  https...
6289    Palestine  Occupation forces heavily fire tear...
Name: content, Length: 6290, dtype: object


# **Remove special chars**

In [4]:
import pandas as pd
import re

# Define a function to remove punctuation
def remove_special_chars(text):
    return re.sub(r'[^A-Za-z0-9\s]', ' ', text)

# Apply the function to the 'Text' column
text= text.apply(remove_special_chars)

print(text)

0       Theodosius Atallah Hanna the Archbishop of Seb...
1        75 years of ongoing EthnicCleansing of Palestine
2       CNN msnbc foxnews CNN MSNBC FoxNews  Israel is...
3       Get ready for Faris Ishaq s latest single comi...
4       Israeli Police Turns Jerusalem into Military B...
                              ...                        
6285    Palestine  Occupation forces raid the village ...
6286     You re antisemetic  okay and   Palestine jew ...
6287    The children who died at the hands of SouthAfr...
6288    Resistance is NOT terrorism \nPalestine  https...
6289    Palestine  Occupation forces heavily fire tear...
Name: content, Length: 6290, dtype: object


# **Convert to lowercase**

In [5]:
import pandas as pd

def convert_to_lowercase(text):
    return text.lower()

text1 = text.apply(convert_to_lowercase)
print(text1)

0       theodosius atallah hanna the archbishop of seb...
1        75 years of ongoing ethniccleansing of palestine
2       cnn msnbc foxnews cnn msnbc foxnews  israel is...
3       get ready for faris ishaq s latest single comi...
4       israeli police turns jerusalem into military b...
                              ...                        
6285    palestine  occupation forces raid the village ...
6286     you re antisemetic  okay and   palestine jew ...
6287    the children who died at the hands of southafr...
6288    resistance is not terrorism \npalestine  https...
6289    palestine  occupation forces heavily fire tear...
Name: content, Length: 6290, dtype: object


# **Remove stopwords**

In [6]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Replace 'english' with the appropriate language if needed
def remove_stopwords(sentence):
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
text2 = [remove_stopwords(sentence) for sentence in text1]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
text2 = pd.Series(text2)
print(text2)

0       theodosius atallah hanna archbishop sebastia g...
1              75 years ongoing ethniccleansing palestine
2       cnn msnbc foxnews cnn msnbc foxnews israel att...
3       get ready faris ishaq latest single coming soo...
4       israeli police turns jerusalem military barrac...
                              ...                        
6285    palestine occupation forces raid village qasra...
6286         antisemetic okay palestine jew israel witler
6287    children died hands southafrican apartheid loo...
6288    resistance terrorism palestine httpstcoyxzcozrpfi
6289    palestine occupation forces heavily fire tear ...
Length: 6290, dtype: object


# **Remove links**

In [9]:
import pandas as pd
import re

# Remove links from the 'Text' column
text3 = text2.replace(r'http[s]?\S+|www\.\S+', '', regex=True)

# Display the updated DataFrame
print(text3)

0       theodosius atallah hanna archbishop sebastia g...
1              75 years ongoing ethniccleansing palestine
2       cnn msnbc foxnews cnn msnbc foxnews israel att...
3       get ready faris ishaq latest single coming soo...
4       israeli police turns jerusalem military barrac...
                              ...                        
6285    palestine occupation forces raid village qasra...
6286         antisemetic okay palestine jew israel witler
6287    children died hands southafrican apartheid loo...
6288                      resistance terrorism palestine 
6289    palestine occupation forces heavily fire tear ...
Length: 6290, dtype: object


# **Remove number**

In [10]:
text4 = text3.replace(r'\d+', ' ', regex=True)

In [11]:
text4

0       theodosius atallah hanna archbishop sebastia g...
1                 years ongoing ethniccleansing palestine
2       cnn msnbc foxnews cnn msnbc foxnews israel att...
3       get ready faris ishaq latest single coming soo...
4       israeli police turns jerusalem military barrac...
                              ...                        
6285    palestine occupation forces raid village qasra...
6286         antisemetic okay palestine jew israel witler
6287    children died hands southafrican apartheid loo...
6288                      resistance terrorism palestine 
6289    palestine occupation forces heavily fire tear ...
Length: 6290, dtype: object

# **lemmatization**

In [12]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer

# Download required resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to perform lemmatization
def lemmatize_text(text):
    # Tokenize the text into individual words
    words = nltk.word_tokenize(str(text))
    
    # Perform part-of-speech tagging
    pos_tags = nltk.pos_tag(words)
    
    # Lemmatize each word based on its part-of-speech tag
    lemmatized_words = [lemmatizer.lemmatize(word, pos=tag[0].lower()) if tag[0].lower() in ['a', 'n', 'v'] else lemmatizer.lemmatize(word) for word, tag in pos_tags]
    
    # Join the lemmatized words back into a single string
    lemmatized_text = ' '.join(lemmatized_words)
    
    return lemmatized_text

# Apply lemmatization to the 'Text' column
text5 = text4.apply(lemmatize_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [13]:
text5

0       theodosius atallah hanna archbishop sebastia g...
1                  year ongoing ethniccleansing palestine
2       cnn msnbc foxnews cnn msnbc foxnews israel att...
3       get ready faris ishaq latest single come soon ...
4       israeli police turn jerusalem military barrack...
                              ...                        
6285    palestine occupation force raid village qasra ...
6286         antisemetic okay palestine jew israel witler
6287    child die hand southafrican apartheid look u d...
6288                       resistance terrorism palestine
6289    palestine occupation force heavily fire tear g...
Length: 6290, dtype: object

# **Remove non-english words**

In [14]:
import nltk
from nltk.corpus import words

# Download the English word corpus
nltk.download('words')

# Get the set of English words
english_words = set(words.words())

# Define a function to remove non-English words
def remove_non_english_words(text):
    words = nltk.word_tokenize(text)
    english_words_only = [word for word in words if word.lower() in english_words]
    return ' '.join(english_words_only)

# Apply the function to the 'Text' column
text8 = text5.apply(remove_non_english_words)


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [15]:
text8 

0       hanna archbishop orthodox patriarchate occupy ...
1                                            year ongoing
2       attack right let focus attack wonder people do...
3       get ready latest single come soon stream platf...
4           police turn military barrack ahead flag march
                              ...                        
6285    occupation force raid village south enter seve...
6286                                                     
6287    child die hand apartheid look u disrespect all...
6288                                 resistance terrorism
6289    occupation force heavily fire tear gas caniste...
Length: 6290, dtype: object

# **Tokenization**

In [16]:
import nltk
nltk.download('punkt')  # Download the necessary tokenizer data

from nltk.tokenize import word_tokenize

import numpy as np
# Tokenize the text
tokens = text8.apply(word_tokenize)
import nltk
nltk.download('punkt')  # Download the necessary tokenizer data

series = pd.Series(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
series

0       [hanna, archbishop, orthodox, patriarchate, oc...
1                                         [year, ongoing]
2       [attack, right, let, focus, attack, wonder, pe...
3       [get, ready, latest, single, come, soon, strea...
4       [police, turn, military, barrack, ahead, flag,...
                              ...                        
6285    [occupation, force, raid, village, south, ente...
6286                                                   []
6287    [child, die, hand, apartheid, look, u, disresp...
6288                              [resistance, terrorism]
6289    [occupation, force, heavily, fire, tear, gas, ...
Length: 6290, dtype: object

# **Install transformes**

In [18]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


# **Add a column of Toxic or not**
Use a pretrained model to classify comments as toxic or not

In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
i = 0
is_toxic = []
for comment in text8:
  inputs = tokenizer.encode_plus(
      comment,
      add_special_tokens=True,
      padding="max_length",
      max_length=128,
      truncation=True,
      return_tensors="pt"
  )
  outputs = model(**inputs)
  predicted_label = outputs.logits.argmax().item()
  labels = ["Non-toxic", "Toxic"]
  if labels[predicted_label]=="Toxic":
    is_toxic.append(1)
  else:
    is_toxic.append(0)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

# **Calculate tfidf_matrix**

In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess the phrases and convert them to text
series_text = series.apply(lambda x: ' '.join(x))

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the series_text
tfidf_matrix = vectorizer.fit_transform(series_text)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Create a new DataFrame with TF-IDF values
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# **Logistic Regression Model**

In [32]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, is_toxic, test_size=0.2, random_state=42)

# Step 3: Train the logistic regression classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Step 4: Predict the labels for the test set
y_pred = classifier.predict(X_test)

# Step 5: Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)



# **Accuracy**

In [33]:
print(f'Accuracy: {accuracy}')

Accuracy: 0.9332273449920508
