In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Set up the working directory and import required libraries

In [5]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative paths opened later (keywords.yml, 1.txt) are resolved against it.
cd /content/drive/MyDrive/UPwork/sentiment

/content/drive/MyDrive/UPwork/sentiment


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
import sklearn
import time
import spacy
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from tqdm import tqdm
tqdm.pandas()
import yaml

nltk.download('stopwords')
# Arabic stop-word set: NLTK's Arabic list plus additional entries added by hand.
ar_stopwords = set(stopwords.words('arabic'))
ar_stopwords.update(["مع","من","إلى","في","فى","كان","على","علي", "يا"])

# Load the keyword configuration into a Python dictionary.
# - The encoding is pinned to UTF-8 (the keywords used later are Arabic) instead
#   of relying on the platform default.
# - yaml.safe_load only constructs plain YAML data types, so the file cannot
#   trigger construction of arbitrary Python objects.
with open('keywords.yml', encoding='utf-8') as file:
    config = yaml.safe_load(file)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:

def remove_special(text):
    """Delete every '#', '.', ']', '[', '!', 'X' and 'R' character from text."""
    unwanted = '#.][!XR'
    return text.translate(str.maketrans('', '', unwanted))

def remove_punctuations(text):
    """Strip punctuation characters from text.

    The characters removed are the Arabic/typographic marks listed below
    together with everything in string.punctuation.
    """
    arabic_marks = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    # A mapping of code point -> None makes str.translate delete that character.
    drop_table = dict.fromkeys(map(ord, arabic_marks + string.punctuation))
    return text.translate(drop_table)
    
def normalize_arabic(text):
    """Fold Arabic letter variants into a single form.

    Alef variants become bare alef, alef maqsura becomes yeh, teh marbuta
    becomes heh, and gaf becomes kaf. Substitutions run in the order listed.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
        
def remove_repeating_char(text):
    """Collapse each run of one repeated character into a single occurrence.

    Newlines are not collapsed because '.' does not match them.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)

def clean_str(text):
    """Normalise text with a fixed sequence of literal replacements.

    Steps, in order: remove tashkeel (diacritic) code points, shorten doubled
    waw / yeh / alef pairs once, apply the replacement table below, and strip
    surrounding whitespace.
    """
    # Diacritics in U+0617-U+061A and U+064B-U+0652 are dropped.
    text = re.compile(r'[\u0617-\u061A\u064B-\u0652]').sub("", text)

    # A single left-to-right pass over each doubled letter pair.
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    # (needle, substitute) pairs; they are applied strictly in this order.
    replacements = (
        ("أ", "ا"),
        ("إ", "ا"),
        ("آ", "ا"),
        ("ة", "ه"),
        ("_", " "),
        ("-", " "),
        ("/", ""),
        (".", ""),
        ("،", ""),
        (" و ", " و"),
        (" يا ", " يا"),
        ('"', ""),
        ("ـ", ""),
        ("'", ""),
        ("ى", "ي"),
        ("\\", ""),
        ('\n', ' '),
        ('\t', ' '),
        ('&quot;', ' '),
        ('?', ' ? '),
        ('؟', ' ؟ '),
        ('!', ' ! '),
    )
    for needle, substitute in replacements:
        text = text.replace(needle, substitute)

    return text.strip()

    
def keep_only_arabic(text):
    """Remove ASCII letters and '?' from text, then trim surrounding whitespace.

    Digits and any other non-Latin characters are left in place.
    """
    latin_or_qmark = re.compile(r'[a-zA-Z?]')
    stripped = latin_or_qmark.sub('', text)
    return stripped.strip()
    


def split_hashtag_to_words(tag):
    """Split a hashtag into its component words.

    Every '#' is removed first. If the remainder contains an underscore it is
    split on underscores. Otherwise it is split into capitalised ASCII words,
    digit runs and upper-case runs (CamelCase style).

    Parameters
    ----------
    tag : str
        The hashtag, with or without the leading '#'.

    Returns
    -------
    list of str
        The words found. When the CamelCase pattern matches nothing (for
        example an Arabic or all-lowercase tag), the tag itself is returned as
        the only word, so it is not silently dropped. An empty tag gives [].
    """
    tag = tag.replace('#', '')
    parts = tag.split('_')
    if len(parts) > 1:
        return parts

    pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    words = pattern.findall(tag)
    if words:
        return words

    # The pattern only recognises capitalised ASCII words and digits, so fall
    # back to the whole tag rather than returning nothing.
    return [tag] if tag else []

def clean_hashtag(text):
    """Replace each hashtag token in text with the words it is made of.

    Tokens are whitespace-separated. Tokens starting with '#' are expanded via
    extract_hashtag; all other tokens are kept unchanged. The result is joined
    with single spaces.
    """
    pieces = []
    for token in text.split():
        pieces += extract_hashtag(token) if is_hashtag(token) else [token]
    return " ".join(pieces)

def is_hashtag(word):
    """Return True when word begins with '#', otherwise False."""
    return word.startswith("#")

def extract_hashtag(text):
    """Collect the words of every hashtag token found in text.

    Each whitespace-separated token starting with '#' has its trailing
    non-word characters removed and is then split by split_hashtag_to_words.
    The words are returned as one flat list, in order of appearance.
    """
    trailing_nonword = re.compile(r"(\W+)$")
    words = []
    for token in text.split():
        if not token.startswith("#"):
            continue
        words.extend(split_hashtag_to_words(trailing_nonword.sub("", token)))
    return words

def remove_mentions(text):
    """Drop whitespace-separated tokens that begin with '@'.

    The remaining tokens are re-joined with single spaces.
    """
    return " ".join(token for token in text.split() if not token.startswith("@"))

def replace_words(text):
    """Apply the configured word conversions, then blank out stop words.

    Each whitespace-separated token found in config['convert'] is replaced by
    its mapped value. A token (after conversion) that is in ar_stopwords
    becomes an empty string. Tokens are joined with single spaces, so a
    blanked stop word leaves two adjacent spaces in the result.
    """
    tokens = []
    for token in text.split():
        if token in config['convert']:
            token = config['convert'][token]
        tokens.append('' if token in ar_stopwords else token)
    return " ".join(tokens)

def preprocess_text(text, max_chars=250):
    """Clean one raw tweet for vectorisation.

    Parameters
    ----------
    text : str
        Raw tweet text. A non-string value (for example NaN from an empty CSV
        cell) yields '' instead of raising.
    max_chars : int or None, optional
        Only the first max_chars characters are processed (default 250).
        None processes the whole text.

    Returns
    -------
    str
        The cleaned text with surrounding whitespace stripped; '' when nothing
        is left.
    """
    if not isinstance(text, str):
        return ''

    text = text[:max_chars]

    # Drop @mentions: whole tokens first, then any '@...' left inside a token.
    text = remove_mentions(text)
    text = re.sub(r'@[^\s]+', ' ', text)

    # Apply configured word conversions and blank out stop words.
    text = replace_words(text)

    # Convert www.* or https?://* to " ".
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)

    # Replace #word with word.
    # NOTE(review): remove_punctuations below deletes '_', so a multi-word
    # hashtag such as '#a_b' ends up as the single token 'ab' - confirm this
    # is intended.
    text = re.sub(r'#([^\s]+)', r'\1', text)

    # Remove punctuation.
    text = remove_punctuations(text)

    # Normalise Arabic letter variants.
    text = normalize_arabic(text)

    # Collapse repeated characters (this also collapses runs of spaces).
    text = remove_repeating_char(text)

    # Remove special letters.
    text = remove_special(text)

    # Clean/normalise Arabic text.
    text = clean_str(text)

    # Remove ASCII letters (English words).
    text = keep_only_arabic(text)

    return text.strip()


# Load the tab-separated tweet file.
df = pd.read_csv("1.txt", sep="\t")
# Add a cleaned copy of every tweet; progress_apply is the tqdm-enabled
# variant of Series.apply registered by tqdm.pandas().
df['clean_text']=df['Tweet Text'].progress_apply(preprocess_text)
# Preview raw and cleaned text side by side.
df.head(15)

100%|██████████| 127797/127797 [00:13<00:00, 9576.32it/s]


Unnamed: 0,Tweet Text,clean_text
0,@All_Qatar_2022 الوضع السلبي جدا والقادم اسوا ...,الوضع السلبي جدا والقادم اسوا المثلين وقس ذلك
1,@1vnrc @LGBTQarabic ادوس عليك,ادوس
2,ظاهرة اختطاف الأطفال المسلمين أنتشرت بالتزامن ...,ظاهره اختطاف الاطفال المسلمين انتشرت بالتزامن ...
3,@1lIolp @shibl_alshibl @LGBTQarabic طيب ستر اف...,طيب ستر افتارك اول
4,الدول العربية ترفض تقبل #الشذوذ و #المثليين وه...,الدول العربيه ترفض تقبل الشذوذ المثلين وهذا ال...
5,@historydefined هسه يجون المثليين يمتبون me an...,هسه يجون المثلين يمتبون
6,@1lIolp @shibl_alshibl @LGBTQarabic محد سألك,محد سالك
7,@LGBTQarabic عقبال الأردن,عقبال الاردن
8,@LGBTQarabic Congrats,
9,اين جماعة الإخوان من كأس العالم الذي يقام في ق...,اين جماعه الاخوان كاس العالم يقام قطر سيكون ال...


In [9]:
# Fit a TF-IDF model on the cleaned tweets and keep the resulting matrix.
# get_similarity reads `vectorizer` as a notebook-level global.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['clean_text'])

In [10]:
def get_similarity(df, vectors, keyword):
    """Add a column holding each row's cosine similarity to keyword.

    The keyword is cleaned with preprocess_text and projected with the
    notebook-level `vectorizer`, then compared against every row of vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to receive the new column; it is modified in place.
    vectors : matrix
        TF-IDF matrix whose rows correspond, in order, to the rows of df.
    keyword : str
        Query keyword; it is also used as the name of the new column.

    Returns
    -------
    pandas.DataFrame
        The same df object, with the df[keyword] column added.

    Raises
    ------
    ValueError
        If vectors does not have exactly one row per row of df.
    """
    query = vectorizer.transform([preprocess_text(keyword)])
    # cosine_similarity accepts sparse input and returns a dense (1, n) array.
    scores = cosine_similarity(query, vectors)[0]

    # Assign by position. Going through a 0..n-1-indexed Series would align on
    # labels and misplace the scores whenever df does not have a default index.
    df[keyword] = scores

    return df



In [13]:
# Score every tweet against each configured keyword (one new column per keyword).
for keyword in config['keywords']:
    df = get_similarity(df, vectors, keyword)
# NOTE(review): dropping clean_text here means the TF-IDF fitting cell cannot
# be re-run unless the preprocessing cell is re-run first.
del df['clean_text']

In [14]:
# Rank tweets by their similarity score in the 'المثليين' column, highest first.
sorted_df = df.sort_values(by=['المثليين'], ascending=False)
sorted_df

Unnamed: 0,Tweet Text,الشواذ,المثليين
111060,@Mo_h_J22 علم المثليين,0.000000,1.0
47516,@Zoozsaleh07 المثليين,0.000000,1.0
58247,#المثليين https://t.co/ccV0QYxAfH,0.000000,1.0
113890,@AlHadawi3 هذا من المثليين,0.000000,1.0
111020,@Mo_h_J22 علم المثليين,0.000000,1.0
...,...,...,...
18036,@lulllulllulllu يمه لا يجوك الفاندومات الثانيه...,0.102485,0.0
83425,@Bandar_AlAqeel والقتلة سوسيوباثيين ياحرام وال...,0.000000,0.0
83426,@Softlavindr @fasnibts @LGBTQarabic اذا لقيت د...,0.000000,0.0
83428,@mrooiy @fasnibts @LGBTQarabic انا م راح ابرر ...,0.000000,0.0
