In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive; the dataset path used by the
# loading cell lives below this mount point.
drive.mount('/content/drive')


In [None]:
import pandas as pd

# Location of the spam CSV inside the mounted Google Drive
file_path = "/content/drive/MyDrive/ColabNotebooks/CODSOFT/SPAM_SMS_DETECTION/spam.csv"



# Load the CSV into a pandas DataFrame, decoding it as latin1
# NOTE(review): presumably the file is not valid UTF-8 -- confirm before changing the encoding
dataset = pd.read_csv(file_path,encoding='latin1')

# Preview the first few rows of the dataset
dataset.head()


In [None]:
# Only a cell's last expression is rendered automatically, so route the summary
# statistics through display() together with the column labels and the shape.
display(dataset.describe(), dataset.columns, dataset.shape)
#dataset.value_counts()

In [None]:
# Print a per-column summary of the DataFrame (dtypes and non-null counts)
dataset.info()

In [None]:
import pandas as pd

def check_missing_values(df):
    """
    Count, print, and return the missing values in each column of a DataFrame.

    Parameters:
    - df: pandas DataFrame object representing the dataset.

    Returns:
    - missing_values: pandas Series containing the count of missing values for each column.
    """
    # Work on the frame that was passed in, not on a notebook-level global
    missing_values = df.isnull().sum()
    print(missing_values)
    return missing_values

# Report the number of missing values per column of the loaded dataset
check_missing_values(dataset)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_missing_value_percentage(df):
    """
    Chart and report how much of each column is missing.

    Draws a red bar chart of the per-column missing-value percentage, then
    prints the same percentages (two decimal places), one line per column.

    Parameters:
    - df: pandas DataFrame object representing the dataset.
    """
    # Share of null entries per column, expressed as a percentage
    null_counts = df.isnull().sum()
    pct_missing = (null_counts / len(df)) * 100

    # Bar chart of the percentages
    plt.figure(figsize=(10, 6))
    pct_missing.plot(kind='bar', color='red')
    plt.title('Percentage of Missing Values in Each Column')
    plt.ylabel('Percentage of Missing Values')
    plt.xlabel('Columns')
    plt.xticks(rotation=0, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # Textual version of the same figures
    print("Column-wise missing value percentages:")
    report_lines = [f"{column}: {percentage:.2f}%" for column, percentage in pct_missing.items()]
    for line in report_lines:
        print(line)

# Plot and print the missing-value percentage of every column in the dataset
plot_missing_value_percentage(dataset)


In [None]:
import pandas as pd

def drop_columns_with_high_missing_percentage(df, threshold=90):
    """
    Remove every column whose share of missing values exceeds ``threshold``.

    Parameters:
    - df: pandas DataFrame object representing the dataset; it is not modified.
    - threshold: missing-value percentage above which a column is dropped
      (default is 90). The comparison is strict, so a column at exactly the
      threshold is kept.

    Returns:
    - A new pandas DataFrame without the columns that exceeded the threshold.

    Side effects:
    - Prints the threshold and the labels of the dropped columns.
    """
    # Per-column percentage of null entries
    pct_missing = df.isnull().sum().div(len(df)).mul(100)

    # Labels of the columns that are too sparse to keep
    too_sparse = pct_missing.loc[lambda pct: pct > threshold].index
    trimmed = df.drop(columns=too_sparse)

    print("Columns dropped due to high missing percentage (> {}%):".format(threshold))
    print(too_sparse)

    return trimmed


# Drop columns whose missing percentage exceeds the default threshold (90%),
# rebind `dataset` to the trimmed frame, and preview its first rows
dataset = drop_columns_with_high_missing_percentage(dataset)
dataset.head()


In [None]:
def print_column_value_counts(df):
  """Print the value counts of every column of ``df``, each framed by dashed rules.

  Columns are numbered from 1 in the order they appear in the DataFrame.
  """
  rule = "-" * 30
  print("Printing Value Counts")
  for position, column in enumerate(df.columns, start=1):
    print(f"Column {position}:{column}")
    print(rule)
    print(f"{df[column].value_counts()}")
    print(rule)
# Print the value counts of every remaining column in the dataset
print_column_value_counts(dataset)
#dataset["v2"].value_counts()

In [None]:
# def remove_duplicates(df):
#     # Print duplicates
#     duplicate_rows = df[df.duplicated()]
#     if not duplicate_rows.empty:
#         print("Duplicate Rows:")
#         print(duplicate_rows)
#     else:
#         print("No duplicate rows found.")

#     # Remove duplicates
#     df.drop_duplicates(inplace=True)

#     print("Duplicates removed.")
#     return df
# dataset = remove_duplicates(dataset)


In [None]:
def encode_target_variable(df, target_column, label_mapping=None):
    """
    Return a copy of ``df`` whose target column holds encoded class labels.

    Parameters:
    - df: pandas DataFrame containing the target column; it is left unmodified.
    - target_column: name of the column holding the class labels.
    - label_mapping: optional dict from original label to encoded value.
      Defaults to {"ham": 0, "spam": 1}.

    Returns:
    - encoded_dataset: new pandas DataFrame with the target column mapped
      through ``label_mapping``. Labels that are absent from the mapping
      become NaN.
    """
    if label_mapping is None:
        label_mapping = {"ham": 0, "spam": 1}

    # Work on a copy so the caller's frame keeps its original labels
    encoded_dataset = df.copy()
    encoded_dataset[target_column] = encoded_dataset[target_column].map(label_mapping)

    return encoded_dataset

# Encode the class labels in column "v1" as integers ("ham" -> 0, "spam" -> 1),
# rebind `dataset` to the encoded copy, and preview its first rows
dataset = encode_target_variable(dataset, target_column="v1")
dataset.head()


In [None]:
 # Give the label and text columns descriptive names (modifies `dataset` in
 # place), then print the per-column summary to confirm the new names.
 dataset.rename(columns={"v1": "target","v2":"messages"}, inplace=True)
 dataset.info()

In [None]:
# Display the DataFrame as it stands at this point in the notebook
dataset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation(df):
    """
    Plot a heatmap of the pairwise correlations between the numeric columns of ``df``.

    Parameters:
    - df: pandas DataFrame object representing the dataset. Non-numeric
      columns (such as raw text) are ignored.
    """
    # Correlation is only defined for numeric data; selecting the numeric
    # columns first keeps corr() from failing on text columns.
    numeric_columns = df.select_dtypes(include='number')
    corr_matrix = numeric_columns.corr()

    # Annotated heatmap of the correlation matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
    plt.title('Correlation Matrix')
    plt.show()

# Example usage:
# Assuming 'dataset' is the DataFrame containing your dataset
# You can call the function like this:
# plot_correlation(dataset)


In [None]:
import nltk
# Fetch the NLTK resources needed for stop-word removal and tokenization
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' rather than 'punkt'
nltk.download('punkt_tab')

In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

def _clean_message(text):
    """Lowercase one message, drop URLs and punctuation, and collapse whitespace."""
    text = text.lower()
    # Strip URLs while the text is still intact, before punctuation removal
    # can change what the pattern matches
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Normalize whitespace last so gaps left by the removals above are collapsed
    return re.sub(r'\s+', ' ', text).strip()


def preprocessor(df):
    """
    Add cleaned-text and token columns derived from the ``messages`` column.

    The frame is modified in place and also returned. Three columns are written:
    - ``cleaned_message``: lowercased text without URLs or punctuation, with
      whitespace collapsed to single spaces.
    - ``tokens``: list of stemmed word tokens with English stop words removed.
    - ``processed_tokens``: the tokens joined into one space-separated string,
      excluding tokens that occur only once across the whole frame.

    Parameters:
    - df: pandas DataFrame with a ``messages`` column of strings.

    Returns:
    - df: the same DataFrame object, with the three columns added.
    """
    df['cleaned_message'] = df['messages'].apply(_clean_message)

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _tokenize(text):
        # Tokenize, drop stop words, then stem what remains
        return [stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words]

    df['tokens'] = df['cleaned_message'].apply(_tokenize)

    # Rare-word removal: a token seen once in the entire frame is dropped
    token_counts = Counter(token for tokens in df['tokens'] for token in tokens)
    rare_words = {token for token, count in token_counts.items() if count <= 1}
    df['processed_tokens'] = df['tokens'].apply(
        lambda tokens: ' '.join(token for token in tokens if token not in rare_words)
    )
    return df
# Run the text preprocessing; it adds the `cleaned_message`, `tokens` and
# `processed_tokens` columns and returns the frame
dataset = preprocessor(dataset)

# Preview the first rows of the preprocessed dataset
dataset.head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Select the feature text (X) and the target variable (y)
# X = dataset['messages']
# X = dataset['cleaned_message']
X = dataset['processed_tokens']
y = dataset['target']

# Initialize TF-IDF vectorizer with its default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights and turn the text into a TF-IDF matrix.
# NOTE(review): this fits on every row of X, i.e. on the full dataset, since
# no train/test split has happened at this point.
X_tfidf = tfidf_vectorizer.fit_transform(X)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score,accuracy_score,precision_score
from sklearn.model_selection import train_test_split
import pandas as pd


# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only; fitting the vectorizer on the full dataset would
# leak test-set statistics into the features. Seed and test size are unchanged.
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Classifiers to compare, all with default hyper-parameters
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "XGBoost Classifier": XGBClassifier()
}

# Each list collects (classifier name, score) tuples for the later summary cells
f1_scores=[]
accuracy_scores=[]
precision_scores=[]

# Train every classifier on the training split and score it on the held-out split
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='binary')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    f1_scores.append((name,f1))
    accuracy_scores.append((name,accuracy))
    precision_scores.append((name,precision))
    print(f"F1 Score for {name}: {f1}")
    print(f"Accuracy Score for {name}: {accuracy}")
    print(f"Precision Score for {name}: {precision}")
    print("-"*50)



In [None]:
import matplotlib.pyplot as plt

def plot_f1_scores(f1_Scores):
    """
    Draw a horizontal bar chart comparing the F1 score of each algorithm.

    Parameters:
    - f1_Scores: List of (algorithm name, F1 score) tuples.
    """
    # Unzip the pairs into parallel sequences of labels and bar lengths
    names, values = zip(*f1_Scores)

    plt.figure(figsize=(10, 6))
    plt.barh(names, values, color='blue')

    # Labelling and cosmetics
    plt.xlabel('F1 Score')
    plt.ylabel('Algorithms')
    plt.title('F1 Scores of Algorithms')
    plt.xlim(0, 1)  # F1 is bounded to [0, 1]
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Chart the F1 scores collected in the classifier comparison loop
plot_f1_scores(f1_scores)


In [None]:
# Show the raw (classifier name, F1 score) tuples
print(f1_scores)

In [None]:
# Combine the per-classifier accuracy, precision and F1 lists into one
# comparison table; column 1 of each helper frame holds the score values.
algorithm_performance = pd.DataFrame(accuracy_scores, columns=['Algorithm', 'Accuracy']).assign(
    **{
        'Precision': pd.DataFrame(precision_scores)[1],
        'F1 Score': pd.DataFrame(f1_scores)[1],
    }
)
algorithm_performance

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Define a pipeline with TfidfVectorizer and MultinomialNB
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Define the parameter grid
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams only, or unigrams + bigrams
    'tfidf__max_df': [0.5, 0.75, 1.0],  # Maximum document frequency
    'clf__alpha': [0.1, 0.5, 1.0],  # Smoothing parameter
}

# Create GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

# The pipeline vectorizes text itself, so it has to be fitted on raw documents
# rather than on an already-vectorized TF-IDF matrix. Split the text with the
# same test size and seed used for the other experiments.
text_train, text_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model using GridSearchCV
grid_search.fit(text_train, labels_train)

# Print the best parameters found
print("Best parameters:", grid_search.best_params_)

# Get the best model (refitted on the whole training split)
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out text
accuracy = best_model.score(text_test, labels_test)
print("Accuracy on test set:", accuracy)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Train XGBoost on the raw (unprocessed) messages
X = dataset['messages']
y = dataset['target']

# Split the raw messages before vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training rows only (no test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the vectorizer on the training text, then apply it to both splits
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full feature matrix under the train-fitted vectorizer, kept so the
# `X_tfidf` name stays defined after this cell
X_tfidf = tfidf_vectorizer.transform(X)

xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, f1_score, accuracy_score,precision_score
# Score the current predictions (`y_pred`) against the held-out labels
f1 = f1_score(y_test, y_pred, average='binary')
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# Report the three metrics, one per line
for label, value in (("F1 score:", f1), ("Accuracy score:", accuracy), ("Precision:", precision)):
    print(label, value)