<a href="https://colab.research.google.com/github/JavariaTanveer/CCN-Codes/blob/main/svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**SVM**

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

# Function for text cleaning
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove all special characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove all single characters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)  # Remove single characters from the start
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Replace multiple spaces with single space
    text = re.sub(r'^b\s+', '', text)  # Remove prefixed 'b'
    text = text.lower()  # Convert to lowercase
    return text

# Upload the dataset file
from google.colab import files
uploaded = files.upload()

# Load the dataset from the uploaded file
df = pd.read_excel(list(uploaded.keys())[0])

# Clean the text data
df['text'] = df['text'].apply(lambda x: clean_text(str(x)))

# Check for any missing values
df.isnull().sum()

# Fill missing values with empty strings
df.fillna('', inplace=True)

# Define the feature (X) and the target (y)
X = df['text']
y = df['label']  # Assuming the labels are in the 'label' column

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF with ngrams
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(1, 2))

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the SVM classifier with hyperparameter tuning
svm_classifier = SVC()

# Use a simpler grid search to find the best hyperparameters
parameters = {
    'C': [1, 10],
    'gamma': [0.1, 0.01],
    'kernel': ['linear']
}

grid_search = GridSearchCV(svm_classifier, parameters, cv=3, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Get the best model
best_svm_classifier = grid_search.best_estimator_

# Train the classifier
best_svm_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = best_svm_classifier.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Saving dataset.xlsx to dataset.xlsx
Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.952
Confusion Matrix:
[[139   0   4]
 [  0 451  36]
 [  0  32 838]]
Classification Report:
              precision    recall  f1-score   support

        fake       1.00      0.97      0.99       143
       fake        0.93      0.93      0.93       487
        real       0.95      0.96      0.96       870

    accuracy                           0.95      1500
   macro avg       0.96      0.95      0.96      1500
weighted avg       0.95      0.95      0.95      1500

