Modelling the events using the Dask library

In [1]:
# Importing Libraries
import re
import string

import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import dask.delayed as delayed
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the four source CSVs into Dask dataframes (ratings, raw articles,
# traffic and domain locations, in that order).
df_data, df_raw, df_traffic, df_domains = map(
    dd.read_csv,
    (
        "/media/moraa/New Volume/Ontita/10Academy/Cohort B/Projects/week0/Datasets/data.csv/rating.csv",
        "/media/moraa/New Volume/Ontita/10Academy/Cohort B/Projects/week0/Datasets/raw_data/data.csv",
        "/media/moraa/New Volume/Ontita/10Academy/Cohort B/Projects/week0/Datasets/traffic_data/traffic.csv",
        "/media/moraa/New Volume/Ontita/10Academy/Cohort B/Projects/week0/Datasets/domains_location.csv",
    ),
)

Handling Missing Values

In [3]:
# Percentage of missing values per column.
# The mean of the boolean null-mask is the missing fraction, so no separate
# len(df_data) pass over the CSV is needed to obtain the denominator.
missing_percentage = df_data.isnull().mean() * 100
print(missing_percentage.compute())  # Compute the result to get the actual percentages

# Replace missing values in the text columns with a placeholder:
# 'Unknown' for author/category, 'Not Available' for description/url_to_image.
df_data = df_data.fillna({
    'author': 'Unknown',
    'description': 'Not Available',
    'url_to_image': 'Not Available',
    'category': 'Unknown',
})

In [None]:
# Dask's head() already returns a concrete pandas DataFrame, so calling
# .compute() on its result would raise AttributeError.
df_data.head()

In [None]:
# Show the summary that the dataframe's info() method reports.
df_data.info()

In [None]:
# The row count is lazy and must be computed; the column count is read directly.
n_rows = df_data.shape[0].compute()
n_columns = df_data.shape[1]
n_rows, "rows,", n_columns, "columns"

Text preprocessing

Keyword extraction/modelling using TF-IDF

In [None]:
# Download the NLTK resources needed for lemmatizing, stop-word removal and
# tokenizing. Recent NLTK releases load 'punkt_tab' in word_tokenize, while
# older ones load 'punkt', so both are requested (an id unknown to the
# installed version is only reported, not raised).
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# --- Text preprocessing ------------------------------------------------------

# Emoticons (":)", ";)", ":-(", ":D", ":P", ...) are matched only when they are
# not immediately followed by a word character, so ordinary text such as
# "Re:Done" is not truncated to "Reone".
_EMOTICON_PATTERN = re.compile(r'(?::-?[)(DPSO|]|;\))(?!\w)')

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the NLTK objects, which are expensive to build per call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    resources = _TEXT_RESOURCES.get('pair')
    if resources is None:
        # Build the full pair first and store it in one assignment so concurrent
        # callers never observe a half-initialised cache.
        resources = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _TEXT_RESOURCES['pair'] = resources
    return resources


def preprocess_text(text):
    """Normalise a piece of raw text into a space-separated string of lemmas.

    Steps, in order: collapse whitespace, drop ``http...`` URLs, drop non-ASCII
    characters (which removes emojis), drop emoticons, lowercase, strip
    punctuation, tokenize, remove English stop words and lemmatize.

    Parameters
    ----------
    text : str or any
        The raw text. Any non-string value (``None``, ``NaN``, ...) is treated
        as missing.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing or effectively empty input.
    """
    # Missing values reach this function as None/NaN rather than str.
    if not isinstance(text, str):
        return ''

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove emojis (and any other non-ASCII characters)
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove emoticons (before lowercasing, since ":D" etc. are case-sensitive)
    text = _EMOTICON_PATTERN.sub('', text)
    # Lowercase and remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Nothing left to tokenize.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _get_text_resources()
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    # Join the lemmatized tokens back into a single string
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

# Function to perform TF-IDF vectorization and keyword extraction
def extract_keywords(df_data, top_n=10, max_features=1000):
    """Rank the terms of the 'content' column by their summed TF-IDF weight.

    Parameters
    ----------
    df_data : dask.dataframe.DataFrame
        Frame with a 'content' column holding the article text.
    top_n : int, default 10
        Number of highest-scoring terms to return.
    max_features : int, default 1000
        Vocabulary size limit passed to ``TfidfVectorizer``.

    Returns
    -------
    dask.dataframe.Series
        Lazy series indexed by term holding the ``top_n`` largest summed
        TF-IDF weights; call ``.compute()`` to materialise it.

    Notes
    -----
    As a side effect a 'clean_content' column (the preprocessed text) is
    added to ``df_data``.
    """
    # Missing content is replaced by '' so preprocessing never receives NaN;
    # meta is given explicitly so Dask does not have to guess the output dtype.
    df_data['clean_content'] = df_data['content'].fillna('').map(
        preprocess_text, meta=('clean_content', 'object')
    )

    # scikit-learn works on in-memory data, so materialise the cleaned text once
    # instead of handing it a lazy Dask series.
    clean_documents = df_data['clean_content'].compute()

    # Perform TF-IDF vectorization (the result is a scipy sparse matrix)
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(clean_documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Sum each term's weight directly on the sparse matrix; this avoids building
    # a dense documents x terms array just to add up its columns.
    column_totals = np.asarray(tfidf_matrix.sum(axis=0)).reshape(1, -1)

    # Wrap the single row of totals in a Dask DataFrame so the function keeps
    # returning a lazy, .compute()-able series indexed by term.
    tfidf_totals = dd.from_dask_array(
        da.from_array(column_totals, chunks=column_totals.shape),
        columns=list(feature_names),
    )

    # Identify top keywords
    return tfidf_totals.sum().nlargest(top_n)

# Run preprocessing and keyword extraction over the articles
top_keywords = extract_keywords(df_data)

# Materialise the lazy result, then show the ranked keywords
top_keywords_computed = top_keywords.compute()
print("Top Keywords:", top_keywords_computed)

Similarity Search

In [None]:
# Cast the 'title' column to strings. Missing titles are replaced by '' first;
# a bare astype(str) would turn them into the literal text 'nan', which would
# then be treated as a real word by the text processing.
df_data['title'] = df_data['title'].fillna('').astype(str)

In [None]:
# Function to calculate cosine similarity between headline/title and news body for a subset of articles
def calculate_similarity_subset(df_data, sample_size=1000):
    """Mean cosine similarity between each sampled article's title and its body.

    Parameters
    ----------
    df_data : dask.dataframe.DataFrame
        Frame with 'title' and 'content' columns.
    sample_size : int, default 1000
        Approximate number of articles to sample. Dask samples by fraction,
        so the actual count may differ slightly; if the frame has fewer rows
        than this, all rows are used.

    Returns
    -------
    float
        Average of the per-article title/body cosine similarities, or ``nan``
        when there is nothing to sample.

    Raises
    ------
    ValueError
        Raised by scikit-learn if the cleaned text yields an empty vocabulary.
    """
    total_rows = len(df_data)
    if total_rows == 0 or sample_size <= 0:
        return float('nan')

    # Cap the fraction at 1.0 so small frames do not request more rows than exist.
    sample_fraction = min(1.0, sample_size / total_rows)
    df_sample = df_data[['title', 'content']].sample(frac=sample_fraction, random_state=42)

    # preprocess_text works on one string at a time, so it is mapped element-wise
    # (map_partitions would hand it a whole pandas Series). Both cleaned columns
    # are materialised in a single compute.
    df_clean = df_sample.assign(
        clean_title=df_sample['title'].fillna('').map(
            preprocess_text, meta=('clean_title', 'object')
        ),
        clean_content=df_sample['content'].fillna('').map(
            preprocess_text, meta=('clean_content', 'object')
        ),
    )[['clean_title', 'clean_content']].compute()

    if df_clean.empty:
        return float('nan')

    # One vectorizer fitted on titles and bodies together, so both matrices share
    # the same vocabulary; separately fitted vectorizers produce unrelated (and
    # usually differently sized) feature spaces that cannot be compared.
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_vectorizer.fit(list(df_clean['clean_title']) + list(df_clean['clean_content']))
    tfidf_matrix_title = tfidf_vectorizer.transform(df_clean['clean_title'])
    tfidf_matrix_content = tfidf_vectorizer.transform(df_clean['clean_content'])

    # TfidfVectorizer L2-normalises rows by default, so the row-wise dot product
    # is the cosine similarity of each title with its own body. This also avoids
    # the n x n all-pairs matrix, whose mean mixes unrelated articles.
    paired_similarity = tfidf_matrix_title.multiply(tfidf_matrix_content).sum(axis=1)
    return float(paired_similarity.mean())

# Score a sampled subset of the articles
overall_similarity_subset = calculate_similarity_subset(df_data)

similarity_label = "Overall similarity between keywords in headline/title and news body across sites (subset):"
print(similarity_label, overall_similarity_subset)