In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Download NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

# Load the Excel data
file_path = '/content/drive/MyDrive/SLT/Preprocessed data.xlsx'  # Replace with your file path
data = pd.read_excel(file_path)

# Display the first few rows of the data
print(data.head())
content_to_label = {
    'sport': 0,
    'technologie': 1,
    'business': 2,
    'graphics': 3,
    'entertainment': 4,
    'politics': 5,
    'food': 6,
    'historical': 7,
    'medical': 8,
    'space': 9
}

# Convert content categories to numerical labels
data['label'] = data['Category'].map(content_to_label)

# Handling missing values in the label column
data.dropna(subset=['label'], inplace=True)

# Define lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Preprocess text data
def preprocess_text(text):
    # Remove non-alphabetic characters and lowercase
    text = re.sub("[^a-zA-Z]", " ", str(text)).lower()

    # Tokenization
    words = text.split()

    # Remove stop words, lemmatize, and stem
    stop_words = set(stopwords.words("english"))
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word not in stop_words]

    return " ".join(words)

# Apply preprocessing to the 'Text' column
data['Processed_Text'] = data['Text'].apply(preprocess_text)

# Print the number of total documents before dropping
total_docs_before = len(data)
print(f"Number of Total Documents Before: {total_docs_before}")

# Print the number of duplicates before dropping
num_duplicates_before = data.duplicated(subset='Processed_Text').sum()
print(f"Number of Duplicates Before: {num_duplicates_before}")

# Drop duplicates based on the 'Processed_Text' column
data = data.drop_duplicates(subset='Processed_Text', keep='first')

# Print the number of total documents after dropping
total_docs_after = len(data)
print(f"Number of Total Documents After: {total_docs_after}")

# Print the number of duplicates after dropping
num_duplicates_after = data.duplicated(subset='Processed_Text').sum()
print(f"Number of Duplicates After: {num_duplicates_after}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Processed_Text'], data['label'], test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train an SVM classifier
svm_classifier = SVC(kernel='linear')  # You can experiment with different kernels (linear, rbf, etc.)
svm_classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print(classification_report(y_test, y_pred))

# Find the number of documents in each content category
content_counts = data['Category'].value_counts()
print("\nNumber of documents in each content category:")
print(content_counts)


                                                Text  Category
0  Lufthansa flies back to profit.German airline ...  business
1  Japanese growth grinds to a halt\n\nGrowth in ...  business
2  WorldCom director admits lying\n\nThe former c...  business
3  Glaxo aims high after profit fall\n\nGlaxoSmit...  business
4  Peugeot deal boosts Mitsubishi\n\nStruggling J...  business
Number of Total Documents Before: 999
Number of Duplicates Before: 19
Number of Total Documents After: 980
Number of Duplicates After: 0
Accuracy: 0.96
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      0.93      0.96        14
           2       1.00      0.95      0.97        20
           3       0.96      1.00      0.98        26
           4       1.00      1.00      1.00        12
           5       0.92      1.00      0.96        23
           6       1.00      1.00      1.00        20
           7       1.00      1.00    