# Data Preprocessing

In [1]:
# Data cleaning
# 1. remove: tweet id, tweet date from the dataset

In [60]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import skfuzzy as fuzz
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
input_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//cnnhealth.txt' 
output_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets.xlsx' 

# List to store the extracted content
content_list = []

# Read and process the file
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        if '|' in line:  # Check if the line contains '|'
            parts = line.split('|', 2)  # Split into parts
            if len(parts) > 1:  # Ensure it has at least two parts
                content = parts[2].strip()  # Extract the content part
                if content:  # Avoid empty lines
                    content_list.append(content)

# Create a DataFrame and save to Excel
df = pd.DataFrame(content_list, columns=['Tweet Content'])
df.to_excel(output_file, index=False)

print(f"Extracted tweets have been saved to {output_file}")


Extracted tweets have been saved to C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets.xlsx


In [4]:
data = pd.read_excel(output_file)

In [5]:
data.head()

Unnamed: 0,Tweet Content
0,An abundance of online info can turn us into e...
1,A plant-based diet that incorporates fish may ...
2,It doesn't take much to damage your hearing at...
3,RT @CNN: Forever young? Discover this island’s...
4,RT @CNN: Is post-traumatic stress disorder in ...


In [6]:
# 2. remove any link or url(http:) from the content of each cell (because our dataset now in excel format)

In [7]:
def remove_urls(text):
    url_pattern = r'http\S+'  
    return re.sub(url_pattern, '', text).strip()

# Load the Excel file
input_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets.xlsx'
output_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_remove_url.xlsx'

# Read the data into a DataFrame
df = pd.read_excel(input_file)

# Apply the function to the 'Tweet Content' column
df['Tweet Content'] = df['Tweet Content'].apply(remove_urls)

# Save the cleaned data back to a new Excel file
df.to_excel(output_file, index=False)

print(f"Cleaned tweets have been saved to {output_file}")

Cleaned tweets have been saved to C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_remove_url.xlsx


In [8]:
dataset = pd.read_excel('C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_remove_url.xlsx')

In [9]:
dataset.head()

Unnamed: 0,Tweet Content
0,An abundance of online info can turn us into e...
1,A plant-based diet that incorporates fish may ...
2,It doesn't take much to damage your hearing at...
3,RT @CNN: Forever young? Discover this island’s...
4,RT @CNN: Is post-traumatic stress disorder in ...


In [None]:
# 3. remove Patterns : @,/,\,;,:,'',"",*,?-

In [14]:
def clean_text(text):
    
    special_char_pattern = r'[#/@?\'`;:\\!*+_"-]'  # Pattern to remove special characters
    text = re.sub(special_char_pattern, '', text)  # Remove special characters
    return text.strip()

# File paths
input_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_remove_url.xlsx'
output_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_remove_special_chars.xlsx'

# Read the data
df = pd.read_excel(input_file)

# Clean the 'Tweet Content' column
df['Tweet Content'] = df['Tweet Content'].apply(clean_text)

# Save the cleaned data
df.to_excel(output_file, index=False)

print(f"Cleaned tweets have been saved to {output_file}")

Cleaned tweets have been saved to C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_remove_special_chars.xlsx


In [15]:
dataset2 = pd.read_excel(output_file)

In [None]:
dataset2.head()

#  tweets treatment: NLP (Natural LanguageProcessing) 

In [None]:
# 4. NLTK process

In [24]:
nltk.download('stopwords') 
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english')) 
stemmer = PorterStemmer()  
lemmatizer = WordNetLemmatizer()  

def nlp_processing(tweet):
    # 1. Convertir en minuscules
    tweet = tweet.lower()
    # 2. Tokenisation
    tokens = word_tokenize(tweet)
    # 3. Suppression des mots vides
    tokens = [word for word in tokens if word not in stop_words]
    # 4. Racisation (Stemming)
    tokens = [stemmer.stem(word) for word in tokens]
    # 5. Lemmatisation
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Retourner les tokens nettoyés sous forme de chaîne
    return ' '.join(tokens)

# Lecture du fichier Excel contenant les tweets
input_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_remove_special_chars.xlsx'
output_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_cleaned.xlsx'

# Charger les tweets dans un DataFrame
df = pd.read_excel(input_file)

# Appliquer le traitement NLP à chaque tweet
df['Processed Tweet Content'] = df['Tweet Content'].apply(nlp_processing)

# Sauvegarder les résultats dans un nouveau fichier Excel
df.to_excel(output_file, index=False)

print(f"Les tweets traités ont été sauvegardés dans {output_file}")


[nltk_data] Downloading package stopwords to C:\Users\PC-
[nltk_data]     Mayssa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\PC-
[nltk_data]     Mayssa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\PC-
[nltk_data]     Mayssa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\PC-
[nltk_data]     Mayssa\AppData\Roaming\nltk_data...


Les tweets traités ont été sauvegardés dans C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_cleaned.xlsx


In [25]:
data = pd.read_excel('C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_cleaned.xlsx')

In [26]:
data.head()

Unnamed: 0,Tweet Content,Processed Tweet Content
0,An abundance of online info can turn us into e...,"abund onlin info turn u ehypochondriac . , wor..."
1,A plantbased diet that incorporates fish may b...,plantbas diet incorpor fish may key prevent co...
2,It doesnt take much to damage your hearing at ...,doesnt take much damag hear sport bar nightclu...
3,RT CNN Forever young Discover this island’s se...,rt cnn forev young discov island ’ secret long...
4,RT CNN Is posttraumatic stress disorder in you...,rt cnn posttraumat stress disord gene simpl bl...


In [27]:
# 5. copy only the second column in new excel file to apply after the clustering algorithms

input_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//tweets_cleaned.xlsx'
output_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//final_tweets.xlsx'

# Load the processed Excel file
df = pd.read_excel(input_file)

# Select only the 'Processed Tweet Content' column
processed_tweets = df[['Processed Tweet Content']]

# Save to a new Excel file
processed_tweets.to_excel(output_file, index=False)

print(f"The column 'Processed Tweet Content' has been saved to {output_file}")

The column 'Processed Tweet Content' has been saved to C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//final_tweets.xlsx


In [30]:
dataset_final = pd.read_excel('C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//final_tweets.xlsx')

dataset_final.head()

# tweets classification

In [35]:
# 6. apply clustering algorithms 
# 6.1 kmeans

In [54]:
def jaccard_distance(X, Y):
    intersection = np.array([len(set(x.split()) & set(y.split())) for x, y in zip(X, Y)])
    union = np.array([len(set(x.split()) | set(y.split())) for x, y in zip(X, Y)])
    return 1 - intersection / union

def k_means_clustering(data, k):
    # Convert text data to numerical data using TF-IDF (for tokenizing)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(data).toarray()
    
    # Apply K-Means clustering (with custom distance metric)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    
    # Calculate Jaccard distances
    jaccard_distances = pairwise_distances(X, metric='jaccard')
    
    # Compute SSE (Sum of Squared Errors using Jaccard distance)
    sse = np.sum(np.min(jaccard_distances, axis=1))  # Sum of distances of each point to its closest centroid
    
    # Count the number of tweets in each cluster
    unique, counts = np.unique(kmeans.labels_, return_counts=True)
    cluster_sizes = dict(zip(unique, counts))
    
    # Print results
    print(f"K = {k}")
    print(f"SSE = {sse:.3f}")
    for cluster, size in cluster_sizes.items():
        print(f"Cluster {cluster + 1}: {size} tweets")
        
        # Display one example tweet from each cluster
        cluster_indices = np.where(kmeans.labels_ == cluster)[0]
        example_tweet = data[cluster_indices[0]]  # First tweet in the cluster
        print(f"Example Tweet from Cluster {cluster + 1}: {example_tweet}\n")
    
    return kmeans

In [48]:
# 6.2 fuzzy c means

In [65]:
def jaccard_distance(text1, text2):
    set1 = set(text1.split())
    set2 = set(text2.split())
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return 1 - (intersection / union)

def fuzzy_c_means_clustering(data, k):
    # Apply Fuzzy C-Means clustering (we'll compute the distances manually)
    n_samples = len(data)
    
    # Apply fuzzy c-means clustering using the vectorized representation of the data
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(data).toarray()
    
    # Fuzzy C-Means clustering using sklearn fuzzy c-means
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        X.T, k, 2, error=0.005, maxiter=1000, init=None)
    
    # Now compute the Jaccard distances between the tweets and the cluster centers
    jaccard_distances = np.zeros((n_samples, k))
    for x in range(n_samples):
        for c in range(k):
            cluster_center_words = ' '.join([vectorizer.get_feature_names_out()[i] for i in range(len(cntr[c])) if cntr[c][i] > 0])
            jaccard_distances[x, c] = jaccard_distance(data[x], cluster_center_words)
    
    cluster_membership = np.argmin(jaccard_distances, axis=1)

    unique, counts = np.unique(cluster_membership, return_counts=True)
    cluster_sizes = dict(zip(unique, counts))
    sse = np.sum(np.min(jaccard_distances, axis=1))
    
    print(f"K = {k}")
    print(f"SSE = {sse:.3f}")
    
    for cluster, size in cluster_sizes.items():
        print(f"Cluster {cluster + 1}: {size} tweets")
        cluster_indices = np.where(cluster_membership == cluster)[0]
        example_tweet = data[cluster_indices[0]]  # First tweet in the cluster
        print(f"Example Tweet from Cluster {cluster + 1}: {example_tweet}\n")
    
    return cntr, u

In [56]:
# 7. apply them to our dataset (final_tweet)

In [57]:
input_file = 'C://Users//PC-Mayssa//Desktop//2MR//Data Mining//.ipynb_checkpoints//final_tweets.xlsx'
df = pd.read_excel(input_file)

# Get the processed tweets as a list
tweets = df['Processed Tweet Content'].tolist()

In [58]:
print(tweets)

['abund onlin info turn u ehypochondriac . , wors , lead u neglect get care need', 'plantbas diet incorpor fish may key prevent colorect cancer', 'doesnt take much damag hear sport bar nightclub . that billion peopl risk .', 'rt cnn forev young discov island ’ secret longev thewonderlist w billweircnn', 'rt cnn posttraumat stress disord gene simpl blood test may one day help tell', 'maysoon zayid , tour standup comic cerebr palsi , messag share .', 'woman wipe alzheim , mariashriv .', 'rt cnnopinion woman defeat alzheim , say mariashriv . wipeoutalz challeng make happen .', 'time rais legal smoke age', 'cdc misus garment may led releas bioterror bacteria tulan monkey lab .', 'lose brain tumor , gain perspect cnn jessica moskowitz firstperson experi .', 'may germ microb gene slip human dna , studi say .', 'rt cnn plantbas diet incorpor fish may key prevent colorect cancer', 'fitnat find right life balanc famili , work get fit .', 'robert downey jr. present child iron man robot arm .', '

In [66]:
print("K-Means Results:")
k_means_clustering(tweets, k=3)

# Apply Fuzzy C-Means
fuzzy_c_means_clustering(tweets, k=3)

K-Means Results:




K = 3
SSE = 0.000
Cluster 1: 3171 tweets
Example Tweet from Cluster 1: abund onlin info turn u ehypochondriac . , wors , lead u neglect get care need

Cluster 2: 259 tweets
Example Tweet from Cluster 2: everybodi sixpack . ab workout wont help reveal getfit

Cluster 3: 631 tweets
Example Tweet from Cluster 3: rt cnn forev young discov island ’ secret longev thewonderlist w billweircnn

K = 3
SSE = 4039.542
Cluster 1: 4061 tweets
Example Tweet from Cluster 1: abund onlin info turn u ehypochondriac . , wors , lead u neglect get care need



(array([[0.00213067, 0.00622714, 0.00294834, ..., 0.00183123, 0.00136654,
         0.00059431],
        [0.00213069, 0.00622734, 0.00294846, ..., 0.00183124, 0.00136658,
         0.00059429],
        [0.00213063, 0.00622724, 0.00294853, ..., 0.00183122, 0.00136659,
         0.00059434]]),
 array([[0.33333343, 0.33333373, 0.33333349, ..., 0.33333345, 0.33333337,
         0.33333396],
        [0.33333335, 0.33333327, 0.33333331, ..., 0.33333331, 0.33333343,
         0.33333325],
        [0.33333322, 0.333333  , 0.3333332 , ..., 0.33333324, 0.33333319,
         0.33333279]]))