## import libraries

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Download the pre-trained Word2Vec model
w2v_model = api.load('word2vec-google-news-300')

[nltk_data] Downloading package punkt to C:\Users\HP
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Text pre-processing

In [2]:
def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    text = ' '.join(filtered_words)
    return text

## stemming 
<br>
stemming:-Stemming: remove prefix and suffix <br>
-e.g.: Original Word: "running" <br>
Stem: "run" <br>
Stemming is not accurate, may create non-words, but it's fast

In [3]:
def apply_stemming(text):
    stemmer = PorterStemmer()
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)
    return text

## lemmatization
<br>
Lemmatization: reducing words to their base or dictionary form. <br>
-e.g.: Original Word: "better" <br>
Lemma: "good" <br>
lemmatization is more accurate, returns real words but it's slow.

In [4]:
def apply_lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(lemmatized_words)
    return text

## w2v_model
<br>
numerical representations of words in a high-dimensional vector space where words with similar meanings are closer to each other.
<br>
- Word: King (vector: [0.8, 0.5, 0.3]) <br>
- Word: Queen (vector: [0.75, 0.45, 0.25]) (Notice the closeness) <br>
- Word: Dog (vector: [0.2, 0.7, 0.1]) (Further away due to different meaning) <br>


In [5]:
def get_text_vector(text):
    word_embeddings = []
    for word in text.split():
        try:
            embedding = w2v_model[word]
            word_embeddings.append(embedding)
        except KeyError:
            # If the word is not found in the vocabulary, skip it
            pass
    
    if len(word_embeddings) == 0:
        return None
    
    # Calculate the average of word embeddings
    text_vector = np.mean(word_embeddings, axis=0)
    return text_vector

## Cosine similarity
<br>
ما بعرف شو هي :)

In [6]:
def cosine_similarity_between_texts(text1, text2, limm=True):
    # Preprocess the texts
    if limm == True:
        preprocessed_text1 = apply_lemmatization(preprocess_text(text1))
        preprocessed_text2 = apply_lemmatization(preprocess_text(text2))
    else:
        preprocessed_text1 = apply_stemming(preprocess_text(text1))
        preprocessed_text2 = apply_stemming(preprocess_text(text2))
    
    # Get text vectors
    vector1 = get_text_vector(preprocessed_text1)
    vector2 = get_text_vector(preprocessed_text2)
    
    if vector1 is None or vector2 is None:
        return None
    
    # Calculate cosine similarity
    similarity = cosine_similarity([vector1], [vector2])[0][0]
    return similarity

### Trying consine similarity

In [7]:
# Example texts
text1 = "Tokenization is an important step in natural language processing."
text2 = "Sentence tokenization is a technique used to split text into sentences."

# Calculate cosine similarity between texts
similarity = cosine_similarity_between_texts(text1, text2)

print("Cosine Similarity between Text 1 and Text 2 with lemmatization:", similarity)
similarity = cosine_similarity_between_texts(text1, text2, False)
print("Cosine Similarity between Text 1 and Text 2 with stemming:", similarity)

Cosine Similarity between Text 1 and Text 2 with lemmatization: 0.49738303
Cosine Similarity between Text 1 and Text 2 with stemming: 0.46470538


## Connecting to MySQL database

In [8]:
import mysql.connector
db = mysql.connector.connect(user='root', password='Jana2003?',
                              host='127.0.0.1', database='grad',
                              auth_plugin='mysql_native_password')
cursor = db.cursor()

## عشان نجيب كل النوتس الي بالداتابيس ما عدا النوتس تبعت اليوزر يلي بدنا نعمله ريكومند + يلي عاملهم فلو 

## To retrieve all notes except those belonging to the user for whom we wish to recommend posts and those who does follow them

In [39]:
def get_all_users_note_except_user_and_followed(username_to_exclude):
    # Fetch the list of users that the specified user follows
    cursor.execute(f"""
        SELECT following_id
        FROM followers
        WHERE follower_id = '{username_to_exclude}'
    """)
    followed_users = cursor.fetchall()
    
    # Extract the list of followed user IDs
    followed_user_ids = [user[0] for user in followed_users]
    
    # Convert the list to a string format suitable for SQL IN clause
    followed_user_ids_str = "', '".join(followed_user_ids)
    
    # Include the specified user in the exclusion list
    followed_user_ids_str = f"'{username_to_exclude}', '{followed_user_ids_str}'"
    
    # Fetch notes excluding those from the specified user and followed users
    query = f"""
        SELECT * FROM note
        WHERE user_id NOT IN ({followed_user_ids_str})
    """
    cursor.execute(query)
    notes = cursor.fetchall()
    return notes

### testing the method

In [48]:
all_users_post_except_specific_user = get_all_users_note_except_user_and_followed("aaa")
for post in all_users_post_except_specific_user:
    print(post[2])

aaaa
a
aaaaa
saraa
saraa
saraa
saraa
saraa
a
a


## عشان نجيب النوتس تبعت اليوزر يلي بدنا نعمله ريكومند
## To fetch all the user's notes from the database in order to recommend posts for them based on those notes.

In [42]:
def get_all_user_posts(username):
    cursor.execute(f"SELECT text FROM note WHERE user_id = '{username}'")
    posts = cursor.fetchall()
    # Convert each tuple to a string
    posts_as_strings = [post[0] for post in posts]
    return posts_as_strings

posts = get_all_user_posts("aaa")
print(posts[0], " ", posts[1])
cosine_similarity_between_texts(posts[0], posts[1])

this is an updated text3!   this is a text from a url


0.2628655

## Compine all previous method to recommend user notes

In [45]:
def recommend_user_posts_using_cosine(user_name, number_of_posts):
    user_posts = get_all_user_posts(user_name)

    # Get all other users' posts
    other_users_posts = get_all_users_note_except_user_and_followed(user_name)

    # Initialize recommendations list
    recommendations = []

    # Initialize a set to store the IDs of processed posts
    processed_post_ids = set()

    # Loop through each other user's post
    for post in other_users_posts:
        # Calculate cosine similarity between each user post and the current post
        for user_post in user_posts:
            similarity = cosine_similarity_between_texts(user_post, post[3])
            if similarity and post[0] not in processed_post_ids:  # Check if similarity is not None and post not processed
                recommendations.append({
                    'id': post[0],              # Get the post ID from the tuple
                    'creation_time': post[1],   # Get the creation time from the tuple
                    'user_id': post[2],         # Get the user ID from the tuple
                    'text': post[3],            # Get the post text from the tuple
                    'url': post[4],             # Get the URL from the tuple
                    'comment': post[5],         # Get the comment from the tuple
                    'similarity': similarity    # Include the calculated similarity
                })
                # Add the post ID to the set of processed post IDs
                processed_post_ids.add(post[0])

    # Sort recommendations by similarity (highest first)
    recommendations.sort(key=lambda x: x['similarity'], reverse=True)

    # Return only the requested number of recommendations
    return recommendations[:number_of_posts]

### Testing the method

In [46]:
recommend_user_posts_using_cosine("aaa",5)

[{'id': 1,
  'creation_time': datetime.datetime(2024, 4, 13, 18, 13, 29, 89000),
  'user_id': 'aaaa',
  'text': 'this is an updated text2!',
  'url': 'www.hello.com',
  'comment': "this is jaaa's comment",
  'similarity': 1.0},
 {'id': 4,
  'creation_time': datetime.datetime(2024, 4, 13, 18, 23, 35, 291000),
  'user_id': 'a',
  'text': 'this is a text from a url',
  'url': 'www.hello.com',
  'comment': "this is a's comment",
  'similarity': 0.2628655},
 {'id': 6,
  'creation_time': datetime.datetime(2024, 4, 16, 20, 55, 6, 597000),
  'user_id': 'saraa',
  'text': 'this is a text from a url',
  'url': 'www.hello.com',
  'comment': "this is a's comment",
  'similarity': 0.2628655},
 {'id': 7,
  'creation_time': datetime.datetime(2024, 4, 16, 20, 55, 8, 120000),
  'user_id': 'saraa',
  'text': 'this is a text from a url',
  'url': 'www.hello.com',
  'comment': "this is a's comment",
  'similarity': 0.2628655},
 {'id': 8,
  'creation_time': datetime.datetime(2024, 4, 16, 20, 55, 9, 448000)

## Euclidean similarity
<br>
ما بعرف شو هي :)

In [14]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def euclidean_similarity_between_texts(text1, text2, limm=True):
    # Preprocess the texts
    if limm == True:
        preprocessed_text1 = apply_lemmatization(preprocess_text(text1))
        preprocessed_text2 = apply_lemmatization(preprocess_text(text2))
    else:
        preprocessed_text1 = apply_stemming(preprocess_text(text1))
        preprocessed_text2 = apply_stemming(preprocess_text(text2))
    
    # Get text vectors
    vector1 = get_text_vector(preprocessed_text1)
    vector2 = get_text_vector(preprocessed_text2)
    
    if vector1 is None or vector2 is None:
        return None
    
    # Calculate Euclidean distance
    distance = euclidean_distances([vector1], [vector2])[0][0]
    
    # Convert distance to similarity
    similarity = 1 / (1 + distance)
    
    return similarity

## Compine all previous method to recommend user notes

In [15]:
def recommend_user_posts_using_euclidean_distances(user_name, number_of_posts):
    
    user_posts = get_all_user_posts(user_name)

    # Get all other users' posts
    other_users_posts = get_all_users_note_except_user(user_name)

    # Initialize recommendations list
    recommendations = []

    # Loop through each other user's post
    for post in other_users_posts:
    # Calculate cosine similarity between each user post and the current post
        for user_post in user_posts:
            similarity = euclidean_similarity_between_texts(user_post, post[0])
            if similarity:  # Check if similarity is not None
                recommendations.append({
                  'id': post[1],  # Get the post ID from the tuple
                  'text': post[0],  # Get the post text from the tuple
                  'similarity': similarity
                })

    # Sort recommendations by similarity (highest first)
    recommendations.sort(key=lambda x: x['similarity'], reverse=True)

    # Return only the requested number of recommendations
    return recommendations[:number_of_posts]

### Testing the method

In [16]:
recommend_user_posts_using_euclidean_distances("aaaa",5)

[{'id': 12, 'text': 'this is an updated text3!', 'similarity': 1.0},
 {'id': 15, 'text': 'Google Chrome', 'similarity': 0.22105131955806642},
 {'id': 4,
  'text': 'this is a text from a url',
  'similarity': 0.2133374349746918},
 {'id': 6,
  'text': 'this is a text from a url',
  'similarity': 0.2133374349746918},
 {'id': 7,
  'text': 'this is a text from a url',
  'similarity': 0.2133374349746918}]

# Dot Product

In [17]:
def dot_product_similarity_between_texts(text1, text2, limm=True):
    # Preprocess the texts
    if limm == True:
        preprocessed_text1 = apply_lemmatization(preprocess_text(text1))
        preprocessed_text2 = apply_lemmatization(preprocess_text(text2))
    else:
        preprocessed_text1 = apply_stemming(preprocess_text(text1))
        preprocessed_text2 = apply_stemming(preprocess_text(text2))
    
    # Get text vectors
    vector1 = get_text_vector(preprocessed_text1)
    vector2 = get_text_vector(preprocessed_text2)
    
    if vector1 is None or vector2 is None:
        return None
    
    # Calculate dot product
    dot_product = np.dot(vector1, vector2)
    
    # Normalize vectors
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    
    # Calculate similarity
    similarity = dot_product / (norm_vector1 * norm_vector2)
    
    return similarity

In [18]:
def recommend_user_posts_using_dot_product(user_name, number_of_posts):
    
    user_posts = get_all_user_posts(user_name)

    # Get all other users' posts
    other_users_posts = get_all_users_note_except_user(user_name)

    # Initialize recommendations list
    recommendations = []

    # Loop through each other user's post
    for post in other_users_posts:
    # Calculate cosine similarity between each user post and the current post
        for user_post in user_posts:
            similarity = dot_product_similarity_between_texts(user_post, post[0])
            if similarity:  # Check if similarity is not None
                recommendations.append({
                  'id': post[1],  # Get the post ID from the tuple
                  'text': post[0],  # Get the post text from the tuple
                  'similarity': similarity
                })

    # Sort recommendations by similarity (highest first)
    recommendations.sort(key=lambda x: x['similarity'], reverse=True)

    # Return only the requested number of recommendations
    return recommendations[:number_of_posts]

In [19]:
recommend_user_posts_using_dot_product("aaaa",5)

[{'id': 12, 'text': 'this is an updated text3!', 'similarity': 1.0000001},
 {'id': 4, 'text': 'this is a text from a url', 'similarity': 0.2628655},
 {'id': 6, 'text': 'this is a text from a url', 'similarity': 0.2628655},
 {'id': 7, 'text': 'this is a text from a url', 'similarity': 0.2628655},
 {'id': 8, 'text': 'this is a text from a url', 'similarity': 0.2628655}]