# In [None]:  (cell marker left over from the Jupyter notebook export)
#Install and Import Libraries
import pandas as pd
import numpy as np
import string
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import re
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from xgboost import XGBClassifier


# Ensure the NLTK data packages used by this script are available locally.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the word_tokenize tables from that package; where it is not needed the extra
# request is harmless.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Load SpaCy English model
nlp = spacy.load('en_core_web_sm')


# Read the article dataset from CSV into a DataFrame
data = pd.read_csv("/content/sample_data/AWS_Autonews_data.csv")


# Quick look at the data: first rows, (rows, columns) shape, and column dtypes
print("Data Overview:")
for overview_item in (data.head(), data.shape, data.dtypes):
    print(overview_item)

# Report the number of missing values per column before imputation
print("\nMissing Values Before Fix:")
missing_values_before = data.isnull().sum()
print(missing_values_before)

# Fill missing text fields with explicit placeholders.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form
# operates on an intermediate object, which newer pandas versions warn about
# and, with Copy-on-Write enabled, no longer propagate to `data`.
data['body'] = data['body'].fillna('')
data['image_link'] = data['image_link'].fillna('No image available')

# Report the number of missing values per column after imputation
print("\nMissing Values After Fix:")
missing_values_after = data.isnull().sum()
print(missing_values_after)

# Clean and Correct `publish_date` Column
def parse_dates(date_str):
    """Parse one raw publish-date value.

    The long form 'Weekday, Month DD, YYYY' is tried first; anything else is
    handed to pandas' parser with day-first interpretation.

    Returns:
        A ``datetime`` for the long form, otherwise whatever
        ``pd.to_datetime(..., errors='coerce')`` yields (``NaT`` when the text
        cannot be parsed). Missing input, or a ValueError/TypeError raised by
        the pandas fallback, gives ``None``.
    """
    if pd.isna(date_str):  # missing value: nothing to parse
        return None

    parsed = None
    try:
        parsed = datetime.strptime(date_str, '%A, %B %d, %Y')
    except (ValueError, TypeError):
        # Not in the long form (or not a string): fall back to pandas.
        try:
            parsed = pd.to_datetime(date_str, errors='coerce', dayfirst=True)
        except (ValueError, TypeError):
            parsed = None
    return parsed

# Parse publish dates and keep only the rows whose date could be parsed.
# .copy() makes the filtered frame an independent object so the column
# assignments that follow do not raise pandas' SettingWithCopyWarning.
data['parsed_publish_date'] = data['publish_date'].apply(parse_dates)
data = data.dropna(subset=['parsed_publish_date']).copy()

# Analyze word count in titles.
# Missing headings are treated as empty text (0 words) instead of being
# stringified to "nan" and counted as one word.
data['title_word_count'] = data['heading'].fillna('').astype(str).str.split().str.len()
print("\nWord Count Summary:\n", data['title_word_count'].describe())
plt.figure(figsize=(10, 6))
plt.hist(data['title_word_count'], bins=15, color='skyblue', edgecolor='black')
plt.title('Distribution of Word Count in Titles')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


# Define categories and keywords.
# Maps a category label to the lowercase keywords that vote for it when an
# article body is categorized (see categorize_text below).
# Notes:
#   * Dict order matters: categorize_text picks the best score with max() over
#     a dict built in this order, so ties go to the category listed first.
#   * Some keywords are shared between categories (e.g. 'ai', 'service',
#     'feedback'), so those categories are not mutually exclusive.
categories = {
    'Vehicle Maintenance': ['maintenance', 'repair', 'service', 'oil change', 'tire rotation', 'garage'],
    'Auto Parts': ['parts', 'spare', 'brake', 'engine', 'transmission'],
    'Technology and Innovation': ['technology', 'innovation', 'ai', 'artificial intelligence', 'machine learning'],
    'Market and industry Trend': ['industry', 'news', 'market', 'trend', 'report'],
    'Training and Education': ['training', 'education', 'course', 'certification', 'workshop'],
    'Electric Vehicles': ['electric vehicle', 'ev', 'battery', 'charging', 'tesla'],
    'Autonomous Vehicles': ['autonomous', 'self-driving', 'driverless', 'robotics', 'ai'],
    'Recalls and Safety': ['recall', 'safety', 'accident', 'defect', 'warning'],
    'Customer and Dealership Service': ['customer service', 'dealership', 'service', 'experience', 'feedback'],
    'Events and Trade Shows': ['event', 'trade show', 'conference', 'expo', 'summit'],
    'Design and Engineering': ['design', 'engineering', 'aerodynamics', 'architecture', 'prototype'],
    'Marketing and Sales': ['marketing', 'sales', 'promotion', 'campaign', 'advertising'],
    'Brands and Manufacturers': ['brand', 'manufacturer', 'company', 'ford', 'gm'],
    'Vehicle Performance and Tuning': ['performance', 'tuning', 'horsepower', 'speed', 'dyno'],
    'Job': ['job', 'career', 'employment', 'hiring', 'vacancy'],
    'Insurance': ['insurance', 'policy', 'coverage', 'claim', 'premium'],
    'Manufacturing': ['manufacturing', 'production', 'factory', 'assembly', 'plant'],
    'Mot': ['mot', 'test', 'inspection', 'certificate', 'roadworthy'],
    'Customer Experience': ['customer experience', 'satisfaction', 'loyalty', 'service', 'feedback'],
    'Resale': ['resale', 'aftermarket', 'auction', 'resell', 'exchange']
}

# Text Preprocessing
# English stop words as a set, so membership tests during filtering are cheap.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase *text*, delete ASCII punctuation, tokenize, and drop stop words.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    # One translate pass deletes every character listed in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)

    kept_tokens = []
    for token in nltk.word_tokenize(cleaned):
        if token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

# Tokenize every article body; each row of 'processed_body' holds a token list.
data['processed_body'] = data['body'].map(preprocess_text)

# Categorize text based on keywords
def categorize_text(text, categories):
    """Return the category whose keywords best match a tokenized document.

    Args:
        text: Iterable of tokens (for example the list produced by
            preprocess_text).
        categories: Mapping of category name -> iterable of keyword strings.
            A keyword made of several whitespace-separated words matches when
            those words appear as consecutive tokens.

    Returns:
        The name of the category with the most distinct matching keywords.
        Ties go to the category that comes first in ``categories``.
        'Uncategorized' is returned when no keyword matches at all (or when
        ``categories`` is empty) instead of defaulting to the first category.
    """
    tokens = list(text)
    token_set = set(tokens)

    def contains_phrase(parts):
        # True when `parts` occurs as a run of consecutive tokens.
        width = len(parts)
        return any(
            tokens[start:start + width] == parts
            for start in range(len(tokens) - width + 1)
        )

    best_category = 'Uncategorized'
    best_score = 0
    for category, keywords in categories.items():
        score = 0
        for keyword in set(keywords):  # count each distinct keyword once
            parts = keyword.split()
            if len(parts) > 1:
                score += contains_phrase(parts)
            else:
                score += keyword in token_set
        # Strict '>' keeps the earliest category on ties and leaves
        # 'Uncategorized' in place when every score is zero.
        if score > best_score:
            best_category, best_score = category, score
    return best_category

# Label every article with its best-matching category
data['category'] = data['processed_body'].apply(categorize_text, args=(categories,))

# Number of articles per category (the count is stored in 'processed_body')
category_table = data.groupby('category')['processed_body'].count().reset_index()

# Share of all articles per category, as a percentage rounded to 1 decimal place
total_count = category_table['processed_body'].sum()
category_table['percentage'] = category_table['processed_body'].div(total_count).mul(100).round(1)

# Show the first and last 10 rows of the table (these overlap when the table
# has fewer than 20 rows)
print("First 10 Categories and Subcategories:")
print(category_table.head(10))
print("\nLast 10 Categories and Subcategories:")
print(category_table.tail(10))


# Topic Distribution Analysis: number of articles per category, most frequent first.
plt.figure(figsize=(12, 6))
# The column is passed by name through x=/data= rather than as a bare
# positional Series: current seaborn treats the first positional argument as
# `data` and expects the plotted variable as a keyword.
sns.countplot(x='category', data=data, order=data['category'].value_counts().index)
plt.title('Distribution of News Articles Across Categories')
plt.xlabel('Category')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.show()


# 8. Wordcloud for relevant keywords for filtering, including car brands
# Keyword whitelist for the word cloud below: only words found in one of
# these lists are kept by filter_relevant_words. The dict keys group the
# keywords by theme for readability; the filter reads only the value lists.

relevant_keywords = {
    "Electric Vehicles": ['electric vehicle', 'ev', 'battery', 'charging', 'tesla'],
    "Technology & Innovation": ["technology", "innovation", "ai", "software", "digital", "automation","machine learning","artificial intelligence"],
    "Auto Parts": ["parts", "spare", "brake", "engine", "transmission"],
    "Industry News": ["industry", "news", "market", "trend", "report"],
    "Brands & Manufacturers": [
        "brand", "manufacturer", "company", "production", "industry", "market",
        "toyota", "ford", "volvo", "kia", "honda", "bmw", "mercedes", "nissan",
        "hyundai", "audi", "tesla", "chevrolet", "jeep", "mazda", "subaru", "volkswagen"
    ],
    "MOT": ["mot", "inspection", "test", "certification"],
    "Sales": ["sales", "market", "dealership", "retail", "customer", "price"],
    "Vehicle Maintenance": ["maintenance", "service", "repair", "checkup", "diagnosis", "mechanic"],
    "Design & Engineering": ["design", "engineering", "aerodynamics", "materials", "manufacturing"],
    "Vehicle Performance": ["performance", "acceleration", "speed", "efficiency", "power", "handling"],
    "Education & Training": ["education", "training", "learning", "courses", "certification"],
    "Autonomous Vehicles": ["autonomous", "self-driving", "automation", "sensor", "lidar", "ai"],
    "Repair & Diagnosis": ["repair", "diagnosis", "fix", "mechanic", "maintenance"],
    "Organization": ["organization", "company", "management", "structure", "leadership"]
}

def filter_relevant_words(text, relevant_keywords):
    """Keep only the words of *text* that appear in some keyword list.

    Args:
        text: Whitespace-separated string of words.
        relevant_keywords: Mapping of group name -> list of keyword strings.
            Only the lists are used; the group names are ignored.

    Returns:
        The retained words joined by single spaces, in their original order
        and with repeats preserved. Words are compared as whole tokens, so a
        multi-word keyword never matches an individual word.
    """
    # Build the lookup set once instead of scanning every list for every word.
    vocabulary = set().union(*relevant_keywords.values())
    return ' '.join(word for word in text.split() if word in vocabulary)

# Join every article's tokens into one long space-separated string
all_text = ' '.join(' '.join(words) for words in data['processed_body'])

# Keep only the whitelisted keywords
filtered_text = filter_relevant_words(all_text, relevant_keywords)

# Build the word cloud from the filtered text (fixed random_state for a
# reproducible layout)
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(filtered_text)

# Render the word cloud without axes
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Wordcloud for All Relevant Categories')
plt.show()


# Distribution of News Articles Across Categories (bar chart, most frequent first)
plt.figure(figsize=(12, 6))
# Pass the column by name via x=/data=; current seaborn interprets a bare
# positional Series as `data` rather than as the variable to count.
sns.countplot(x='category', data=data, order=data['category'].value_counts().index)
plt.title('Distribution of News Articles Across Categories')
plt.xlabel('Category')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.show()

# Pie chart for target variable (category)
plt.figure(figsize=(12, 6))
category_counts = data['category'].value_counts()
# Request one colour per slice. A fixed 10-colour palette is cycled by
# plt.pie when there are more than 10 categories, which makes slices with
# repeated colours indistinguishable in the legend.
pie_colors = sns.color_palette("husl", n_colors=len(category_counts))
plt.pie(category_counts, labels=None, autopct=None, startangle=140, colors=pie_colors)
plt.title('Distribution of News Categories')
plt.axis('equal')
# Slice labels live in the legend ("name: count (pct%)") to keep the pie uncluttered.
legend_labels = [
    f"{category}: {count} ({percentage:.1f}%)"
    for category, count, percentage in zip(
        category_counts.index, category_counts, 100*category_counts/category_counts.sum()
    )
]
plt.legend(legend_labels, loc='center right', bbox_to_anchor=(1.3, 0.6), fancybox=True, shadow=True)
plt.show()

# Temporal Trends Analysis: monthly article counts for the two largest categories
data['year_month'] = pd.to_datetime(data['parsed_publish_date'])
top_two_categories = data['category'].value_counts().index[:2]
temporal_data = data[data['category'].isin(top_two_categories)]

# Group by calendar month (as a Period) and category; one column per category
publish_month = temporal_data['year_month'].dt.to_period("M")
temporal_trends = (
    temporal_data
    .groupby([publish_month, 'category'])
    .size()
    .unstack(fill_value=0)
)

temporal_trends.plot(kind='line', figsize=(14, 7), title='Temporal Trends of Top 2 Categories')
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.show()

# Yearly Trends for Selected Categories: article counts per year
data['year'] = data['parsed_publish_date'].dt.year
categories_to_analyze = ['Auto Parts', 'Vehicle Maintenance']
is_selected_category = data['category'].isin(categories_to_analyze)
filtered_data = data[is_selected_category]

# Rows are years, columns are the selected categories, cells are article counts
yearly_counts = filtered_data.groupby(['year', 'category']).size()
yearly_trends = yearly_counts.unstack(fill_value=0)

yearly_trends.plot(
    kind='line',
    figsize=(14, 7),
    marker='o',
    title='Yearly Trends for Auto Parts and Vehicle Maintenance',
)
plt.xlabel('Year')
plt.ylabel('Number of Articles')
plt.show()



import pandas as pd
from nltk import ngrams
from collections import Counter

# Function to get n-grams
def get_ngrams(tokens, n):
    """Return every run of *n* consecutive items of *tokens* as a list of tuples."""
    grams = ngrams(tokens, n)  # nltk yields the n-grams lazily
    return [*grams]

# Per-article bigrams and trigrams, computed from the token lists in 'processed_body'
data['bigrams'] = data['processed_body'].apply(get_ngrams, args=(2,))
data['trigrams'] = data['processed_body'].apply(get_ngrams, args=(3,))

# Count n-grams over all articles and keep the 10 most frequent of each kind
top_bigrams = Counter(gram for grams in data['bigrams'] for gram in grams).most_common(10)
top_trigrams = Counter(gram for grams in data['trigrams'] for gram in grams).most_common(10)

# Tabulate the (n-gram, count) pairs for display
bigram_df = pd.DataFrame(top_bigrams, columns=['Bigram', 'Count'])
trigram_df = pd.DataFrame(top_trigrams, columns=['Trigram', 'Count'])

# Print both tables
print("Top Bigrams:")
print(bigram_df)

print("\nTop Trigrams:")
print(trigram_df)



# Search for keywords and Tags within graphical interface
import ipywidgets as widgets
from IPython.display import display

# 9. Function to search for keywords with graphical interface
def search_articles(keyword, data):
    """Return the rows of *data* whose 'body' contains *keyword*.

    Args:
        keyword: Text to look for. It is matched literally (regex
            metacharacters such as '(' or '+' have no special meaning) and
            without regard to letter case.
        data: DataFrame with a 'body' column of article text.

    Returns:
        The matching rows. Rows whose body is missing never match.
    """
    # case=False compares both sides case-insensitively; lowercasing only the
    # keyword would miss capitalised occurrences in the body text.
    matches = data['body'].str.contains(keyword, case=False, regex=False, na=False)
    return data[matches]

# Function to display search results
def display_search_results(keyword):
    """Search the module-level `data` for *keyword* and show the outcome.

    Prints how many articles matched and displays the 'category' and 'body'
    columns of the first five, or prints a not-found message.
    """
    results = search_articles(keyword, data)

    # Nothing matched: report it and stop.
    if results.empty:
        print(f"No articles found containing the keyword '{keyword}'.")
        return

    print(f"Found {len(results)} articles containing the keyword '{keyword}':")
    preview = results[['category', 'body']].head()  # top 5 results
    display(preview)

# Text input in which the user types the keyword to search for
search_box = widgets.Text(
    description='Search Tags and Keywords:',
    placeholder='Type your search keyword here',
    value='',
    disabled=False,
)

# Callback run when the search box is submitted
def on_search_submit(change):
    """Look up and display articles for the submitted text.

    Only ``change.value`` is read; it is passed on as the search keyword.
    """
    submitted_text = change.value
    display_search_results(submitted_text)

# Attach the function to handle submission (Enter key press)
# NOTE(review): on_submit is reportedly deprecated in newer ipywidgets
# releases in favour of observing the widget's value; confirm against the
# installed version before upgrading.
search_box.on_submit(on_search_submit)

# Display the search box
display(search_box)


# NLP News Similarities

# Jaccard Similarity
def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    Returns:
        A float in [0, 1]. When both sets are empty the union is empty too;
        0.0 is returned in that case instead of raising ZeroDivisionError.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0.0
    return len(set1.intersection(set2)) / union_size

# Pairwise Jaccard similarity between the vocabularies of the categories.
category_labels = data['category'].unique()

# Build each category's vocabulary once, up front, instead of rebuilding both
# sets for every (cat1, cat2) pair inside the double loop.
category_vocabulary = {
    label: set(
        ' '.join(
            ' '.join(words)
            for words in data.loc[data['category'] == label, 'processed_body']
        ).split()
    )
    for label in category_labels
}

similarity_matrix = pd.DataFrame(index=category_labels, columns=category_labels, dtype=float)

for cat1 in category_labels:
    for cat2 in category_labels:
        # The diagonal is left as NaN (a category is not compared with itself),
        # so those cells are not drawn in the heatmap.
        if cat1 != cat2:
            similarity_matrix.at[cat1, cat2] = jaccard_similarity(
                category_vocabulary[cat1], category_vocabulary[cat2]
            )

plt.figure(figsize=(14, 12))  # Enlarge the plot
sns.heatmap(similarity_matrix, annot=True, cmap="YlGnBu", linewidths=0.5, vmin=0, vmax=1)
plt.title('Jaccard Similarity Between Categories Based on Article Bodies')
plt.show()


# Cosine Similarity using TF-IDF: one concatenated document per category
unique_categories = data['category'].unique()
corpus = [
    ' '.join(
        ' '.join(words)
        for words in data.loc[data['category'] == category, 'processed_body']
    )
    for category in unique_categories
]

# NOTE: despite its name, `vectorizer` holds the fitted TF-IDF matrix
# (the return value of fit_transform), not the TfidfVectorizer object.
vectorizer = TfidfVectorizer().fit_transform(corpus)
vectors = vectorizer.toarray()

cosine_sim_matrix = cosine_similarity(vectors)
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=unique_categories,
    columns=unique_categories,
)

plt.figure(figsize=(14, 12))
sns.heatmap(cosine_sim_df, annot=True, cmap="YlGnBu", linewidths=0.5, vmin=0, vmax=1)
plt.title('Cosine Similarity Between Categories Based on Article Bodies')
plt.show()


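# Mean TF-IDF Vector Similarity Between Categories Based on Article Bodies
# (the variables and plot title call this "TF-IDF Weighted Word2Vec", but no
# Word2Vec model is used in this section)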

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

# Convert the list of words in 'processed_body' to a single string for each row
data['processed_body_str'] = data['processed_body'].apply(lambda x: ' '.join(x))

# Train a TF-IDF model on the entire corpus (one row per article)
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(data['processed_body_str'])

# Represent each category by the mean TF-IDF vector of its articles.
# The rows are taken straight from the matrix fitted above (transforming the
# same documents again would give the same values), and the mean is computed
# on the sparse rows, so no dense (articles x features) array is built.
category_vectors = {}
category_of_row = data['category'].to_numpy()
for category in data['category'].unique():
    row_positions = np.flatnonzero(category_of_row == category)
    category_mean = tfidf_matrix[row_positions].mean(axis=0)
    # The sparse mean comes back 2-D (1 x features); flatten it to 1-D.
    category_vectors[category] = np.asarray(category_mean).ravel()

# Compute similarity matrix (cosine similarity between the category vectors)
category_names = list(category_vectors.keys())
category_matrix = np.array([category_vectors[category] for category in category_names])
tfidf_word2vec_sim_matrix = cosine_similarity(category_matrix)
tfidf_word2vec_sim_df = pd.DataFrame(tfidf_word2vec_sim_matrix, index=category_names, columns=category_names)

# Plot the heatmap
# NOTE(review): the title mentions Word2Vec, but only TF-IDF vectors are used
# in this computation; confirm the intended wording.
plt.figure(figsize=(14, 12))  # Enlarge the plot
sns.heatmap(tfidf_word2vec_sim_df, annot=True, cmap="YlGnBu", linewidths=0.5, vmin=0, vmax=1)
plt.title('TF-IDF Weighted Word2Vec Similarity Between Categories Based on Article Bodies')
plt.show()


#Features Extraction and Models

from sklearn.preprocessing import LabelEncoder

from gensim.models import Word2Vec
import numpy as np


# One space-joined string per article, as input for the TF-IDF vectorizer
data['processed_body_str'] = data['processed_body'].map(' '.join)

# TF-IDF features, limited to the 5000 highest-ranked terms
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(data['processed_body_str'])

# Word2Vec embeddings trained on the token lists of all articles
sentences = data['processed_body'].tolist()
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Create a function to compute the average Word2Vec vector for each document
def document_vector(doc):
    """Return the mean Word2Vec vector of the tokens in *doc*.

    Tokens missing from the model vocabulary are ignored. When no token is
    known (or *doc* is empty) a zero vector is returned. Its length is read
    from the trained model rather than hard-coded, so it always matches the
    embedding size.
    """
    known_words = [word for word in doc if word in word2vec_model.wv.key_to_index]
    if not known_words:
        return np.zeros(word2vec_model.wv.vector_size)
    return np.mean(word2vec_model.wv[known_words], axis=0)

# Apply the function to each document in 'processed_body' to get the word embeddings
data['word_embeddings'] = data['processed_body'].apply(document_vector)

# Convert the embeddings into a 2D array (one row per article)
word_embeddings = np.array(data['word_embeddings'].tolist())

# Encode the categories into numeric labels
label_encoder = LabelEncoder()
data['category_label'] = label_encoder.fit_transform(data['category'])

# Define target
y = data['category_label']

# Split data into training and testing sets.
# Both feature matrices and the labels are split in ONE call so that the same
# rows land in train/test for all three by construction, instead of relying
# on two separate calls drawing the same shuffle from a shared random_state.
(X_train_tfidf, X_test_tfidf,
 X_train_word2vec, X_test_word2vec,
 y_train, y_test) = train_test_split(
    X_tfidf, word_embeddings, y, test_size=0.2, random_state=42
)


# Machine Learning and Hybrid Model Results

# Define models
# Maps a display name to an unfitted classifier. The names become part of the
# result keys and plot labels below; dict order is the order of evaluation.
# random_state=42 is passed wherever it appears in the constructor calls.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'SVC': SVC(random_state=42)
}

# Train and evaluate every model on every feature representation
feature_sets = {
    'TF-IDF': (X_train_tfidf, X_test_tfidf),
    'Word2Vec': (X_train_word2vec, X_test_word2vec),
}

# Fix the confusion-matrix layout to ALL encoded classes. Without `labels`,
# the matrix only covers classes that occur in y_test or y_pred, so its size
# can be smaller than label_encoder.classes_ and the class names used as tick
# labels later would no longer line up with its rows and columns.
all_class_labels = np.arange(len(label_encoder.classes_))

# results maps '<model>_<feature>' -> accuracy, weighted F1 and confusion matrix
results = {}

for model_name, model in models.items():
    for feature_name, (X_train, X_test) in feature_sets.items():
        # fit() retrains the estimator, so reusing one instance per model is fine
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred, labels=all_class_labels)
        results[f'{model_name}_{feature_name}'] = {'accuracy': acc, 'f1_score': f1, 'confusion_matrix': cm}

# Visualization of results: one row per '<model>_<feature>' combination
df_results = pd.DataFrame(results).T.reset_index()
# Split on the LAST underscore only, so a model name containing '_' would
# still yield exactly two parts.
df_results[['model', 'feature']] = df_results['index'].str.rsplit('_', n=1, expand=True)
# Each result dict mixes floats with a confusion-matrix array, so after the
# transpose every column has object dtype; make the metrics numeric for plotting.
for metric_column in ('accuracy', 'f1_score'):
    df_results[metric_column] = df_results[metric_column].astype(float)

# One grouped bar chart per metric: models on the x axis, one bar per feature set
for metric_column, metric_label in (('accuracy', 'Accuracy'), ('f1_score', 'F1 Score')):
    plt.figure(figsize=(12, 6))
    sns.barplot(x='model', y=metric_column, hue='feature', data=df_results)
    plt.title(f'Model {metric_label} Comparison')
    plt.xlabel('Model')
    plt.ylabel(metric_label)
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.show()

# Rank all model/feature combinations by weighted F1 and keep the top three
ranked_results = sorted(results.items(), key=lambda item: item[1]['f1_score'], reverse=True)
best_models = ranked_results[:3]

# Confusion-matrix heatmap for each of the top three, labelled with class names
class_names = label_encoder.classes_
for model, metrics in best_models:
    cm = metrics['confusion_matrix']
    plt.figure(figsize=(10, 7))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Confusion Matrix for {model}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Text summary of the same three combinations
print("Best Models by F1 Score:")
for model, metrics in best_models:
    f1_value = metrics['f1_score']
    accuracy_value = metrics['accuracy']
    print(f"{model}: F1 Score = {f1_value}, Accuracy = {accuracy_value}")




