In [1]:
# Anomaly Detection in Queries

# // Question No. 1 \\

# Import necessary libraries
import pandas as pd

# Load the dataset.
# The path is written as a raw string so the Windows backslashes stay literal:
# in a normal string "\M" and "\A" are invalid escape sequences, which Python
# currently only warns about and plans to reject.
df = pd.read_csv(r"E:\MNS-UET BS IT\AICP ML INTERNSHIP\ML Internship Task 3/Queries.csv")

# Check for null values (count of missing entries per column)
print("Null Values:")
print(df.isnull().sum())

# Check column information.
# df.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
print("\nColumn Information:")
df.info()

# Descriptive statistics of the numeric columns
print("\nDescriptive Statistics:")
print(df.describe())


Null Values:
Top queries    0
Clicks         0
Impressions    0
CTR            0
Position       0
dtype: int64

Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Top queries  1000 non-null   object 
 1   Clicks       1000 non-null   int64  
 2   Impressions  1000 non-null   int64  
 3   CTR          1000 non-null   object 
 4   Position     1000 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB
None

Descriptive Statistics:
          Clicks   Impressions     Position
count  1000.0000   1000.000000  1000.000000
mean    172.2750   1939.466000     3.985930
std     281.0221   4856.702605     2.841842
min      48.0000     62.000000     1.000000
25%      64.0000    311.000000     2.010000
50%      94.0000    590.500000     3.120000
75%     169.0000   1582.750000     5.342500
max    5223.0000  73380.

In [None]:
# // Question No. 2 \\

# Convert the CTR column from percentage string to float (fraction of 1).
# The dtype guard keeps this cell idempotent: once CTR is numeric the .str
# accessor would raise AttributeError, so the conversion is skipped on re-run.
if not pd.api.types.is_numeric_dtype(df['CTR']):
    df['CTR'] = df['CTR'].str.rstrip('%').astype(float) / 100

# Display the updated DataFrame
print(df.head())


In [None]:
# // Question No. 3 \\

# Import necessary libraries
import pandas as pd
import re
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
# Raw string: keeps the Windows backslashes literal instead of relying on the
# invalid escape sequences "\M" / "\A" that a normal string literal would contain.
# NOTE: re-reading the CSV rebinds `df`, so any column changes made to the
# previous `df` object (e.g. a numeric CTR) are not present on this new frame.
df = pd.read_csv(r"E:\MNS-UET BS IT\AICP ML INTERNSHIP\ML Internship Task 3/Queries.csv")

# Define a function to clean and split queries into words
def clean_and_split_query(query):
    """
    Normalise a search query and break it into word tokens.

    The text is lower-cased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed,
    and the remainder is split on runs of whitespace.

    Returns a list of tokens; an empty list when nothing is left.
    """
    stripped = re.sub(r'[^\w\s]', '', query.lower())
    return stripped.split()

# Tokenise every query with the cleaning helper; each row becomes a list of words
df['Cleaned_Query'] = df['Top queries'].apply(clean_and_split_query)

# Gather the per-query word lists into one flat list
all_words = []
for query_words in df['Cleaned_Query']:
    all_words.extend(query_words)

# Tally how often each word occurs
word_freq = Counter(all_words)

# Tabulate the tallies, most frequent words first
word_freq_df = (
    pd.DataFrame(word_freq.items(), columns=['Word', 'Frequency'])
    .sort_values(by='Frequency', ascending=False)
)

# Bar chart of the 20 most frequent words
top_words = word_freq_df.head(20)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_words, x='Word', y='Frequency')
plt.title('Top 20 Common Words in Search Queries')
plt.xticks(rotation=45, ha='right')  # slant labels so they do not overlap
plt.tight_layout()
plt.show()



In [None]:
# // Question No. 4 \\

import seaborn as sns
import matplotlib.pyplot as plt

# Ten rows with the most clicks, and ten rows with the most impressions
top_queries_by_clicks = df.sort_values(by='Clicks', ascending=False).head(10)
top_queries_by_impressions = df.sort_values(by='Impressions', ascending=False).head(10)

# Draw one bar chart per metric; both charts share the same layout, with the
# query labels slanted so they stay readable.
for metric, top_rows in (
    ('Clicks', top_queries_by_clicks),
    ('Impressions', top_queries_by_impressions),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_rows, x='Top queries', y=metric)
    plt.title(f'Top queries by {metric}')
    plt.xlabel('Query')
    plt.ylabel(metric)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
# // Question No. 5 \\

import seaborn as sns
import matplotlib.pyplot as plt

# Rank on a numeric CTR.
# At this point CTR can still be percentage text (the dataset is re-read from
# CSV earlier in the notebook), and sorting text is lexicographic, which does
# not give the true highest/lowest values. A numeric copy is built for ranking
# and plotting; `df` itself is left untouched so later cells see it unchanged.
ctr_values = df['CTR']
if not pd.api.types.is_numeric_dtype(ctr_values):
    ctr_values = ctr_values.str.rstrip('%').astype(float) / 100
ctr_ranked = df.assign(CTR=ctr_values)

# Ten queries with the highest CTR
highest_ctr_queries = ctr_ranked.sort_values(by='CTR', ascending=False).head(10)

# Ten queries with the lowest CTR
lowest_ctr_queries = ctr_ranked.sort_values(by='CTR', ascending=True).head(10)

# Plot the queries with the highest CTRs (horizontal bars, one per query)
plt.figure(figsize=(10, 6))
sns.barplot(x='CTR', y='Top queries', data=highest_ctr_queries, orient='h')
plt.title('Top queries with Highest CTRs')
plt.xlabel('CTR')
plt.ylabel('Query')
plt.tight_layout()
plt.show()

# Plot the queries with the lowest CTRs
plt.figure(figsize=(10, 6))
sns.barplot(x='CTR', y='Top queries', data=lowest_ctr_queries, orient='h')
plt.title('Top queries with Lowest CTRs')
plt.xlabel('CTR')
plt.ylabel('Query')
plt.tight_layout()
plt.show()


In [None]:
# // Question No. 6 \\

import seaborn as sns
import matplotlib.pyplot as plt

# Remove '%' sign from CTR column and convert to float (fraction of 1).
# Guarded by a dtype check so the cell can be re-run, or run after CTR was
# already converted, without the .str accessor raising AttributeError.
if not pd.api.types.is_numeric_dtype(df['CTR']):
    df['CTR'] = df['CTR'].str.rstrip('%').astype(float) / 100.0

# Compute the pairwise correlation matrix of the four numeric metrics
correlation_matrix = df[['Clicks', 'Impressions', 'CTR', 'Position']].corr()

# Plot the heatmap, annotating each cell with its coefficient (2 decimals)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 12})
plt.title('Correlation Matrix')
plt.show()


In [None]:
# // Question No. 7 \\
from sklearn.ensemble import IsolationForest

# Feature matrix for anomaly detection: the four numeric metrics
feature_columns = ['Clicks', 'Impressions', 'CTR', 'Position']
X = df[feature_columns]

# Fit the Isolation Forest.
# contamination is the expected share of anomalies (tune to the data);
# random_state pins the result so re-runs are reproducible.
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
isolation_forest.fit(X)

# Score and label every row with the fitted model
anomaly_scores = isolation_forest.decision_function(X)
anomaly_predictions = isolation_forest.predict(X)

# Keep the label next to the original data
df['Anomaly'] = anomaly_predictions

# Scatter of clicks against impressions, coloured by the anomaly label
plt.figure(figsize=(10, 6))
plt.scatter(
    df['Impressions'],
    df['Clicks'],
    c=df['Anomaly'],
    cmap='coolwarm',
    alpha=0.5,
)
plt.xlabel('Impressions')
plt.ylabel('Clicks')
plt.title('Anomalies in Search Queries')
plt.colorbar(label='Anomaly')
plt.show()

# List the queries flagged as anomalies (label -1)
is_anomaly = df['Anomaly'] == -1
anomalies_df = df[is_anomaly]
print("Anomalies Detected in Search Queries:")
print(anomalies_df[['Top queries'] + feature_columns])
