<a href="https://colab.research.google.com/github/Jyoti-Hajjargi/Google-Search-Queries-Anomaly-Detection/blob/main/Google_Search_Queries_Anomaly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [83]:
import pandas as pd
from collections import Counter #count word frequency
import re #text cleaning and word extraction.
import plotly.express as px
import plotly.io as pio
# Make "plotly_white" (white background) the default template for the Plotly
# figures created after this line.
pio.templates.default = "plotly_white"

Loading the Dataset

In [84]:
# Colab-only helper: opens a browser file picker and stores the chosen file(s)
# in the runtime's working directory.
from google.colab import files
# `uploaded` holds the return value of the upload call for later inspection.
uploaded = files.upload()

Saving Queries.csv to Queries (3).csv


In [85]:
# Colab never overwrites an existing file: a repeated upload is saved under a
# suffixed name such as "Queries (3).csv", so reading the path "Queries.csv"
# would silently load an older copy. Prefer the bytes returned by the upload
# cell, and fall back to the file on disk when nothing was uploaded under that
# name in this session.
from io import BytesIO

if "Queries.csv" in uploaded:
    queries_df = pd.read_csv(BytesIO(uploaded["Queries.csv"]))
else:
    queries_df = pd.read_csv("Queries.csv")
print(queries_df.head())

                                 Top queries  Clicks  Impressions     CTR  \
0                number guessing game python    5223        14578  35.83%   
1                        thecleverprogrammer    2809         3456  81.28%   
2           python projects with source code    2077        73380   2.83%   
3  classification report in machine learning    2012         4959  40.57%   
4                      the clever programmer    1931         2528  76.38%   

   Position  
0      1.61  
1      1.02  
2      5.94  
3      1.28  
4      1.09  


Exploratory Data Analysis

In [86]:
# Number of missing values in each column.
queries_df.isna().sum()

Unnamed: 0,0
Top queries,0
Clicks,0
Impressions,0
CTR,0
Position,0


In [87]:
# (number of rows, number of columns) of the loaded frame.
queries_df.shape

(1000, 5)

In [88]:
# Total number of cells in the frame (rows x columns).
queries_df.size

5000

In [89]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
queries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Top queries  1000 non-null   object 
 1   Clicks       1000 non-null   int64  
 2   Impressions  1000 non-null   int64  
 3   CTR          1000 non-null   object 
 4   Position     1000 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB
None


Cleaning the CTR Column

In [90]:
# Convert CTR from a percentage string such as "35.83%" to a fraction (0.3583).
# The dtype guard keeps this cell idempotent: after the first run CTR is already
# numeric, and calling `.str` on it again would raise AttributeError.
if not pd.api.types.is_numeric_dtype(queries_df['CTR']):
    queries_df['CTR'] = queries_df['CTR'].str.rstrip('%').astype('float') / 100
queries_df.head()

Unnamed: 0,Top queries,Clicks,Impressions,CTR,Position
0,number guessing game python,5223,14578,0.3583,1.61
1,thecleverprogrammer,2809,3456,0.8128,1.02
2,python projects with source code,2077,73380,0.0283,5.94
3,classification report in machine learning,2012,4959,0.4057,1.28
4,the clever programmer,1931,2528,0.7638,1.09


Word Frequency Analysis

In [91]:
def clean_and_split(query):
    """Return the purely alphabetic words of ``query``, lower-cased.

    A word is a maximal run of ASCII letters delimited by regex word
    boundaries, so tokens glued to digits or underscores (e.g. ``python3``)
    are skipped rather than truncated.
    """
    lowered = query.lower()
    return [match.group() for match in re.finditer(r'\b[a-zA-Z]+\b', lowered)]

Count and plot top 20 words:

In [92]:
# Tally every word produced by clean_and_split across all queries.
word_counts = Counter(
    word
    for query in queries_df['Top queries']
    for word in clean_and_split(query)
)

In [93]:
# Tabulate the 20 most common words and their counts; the chart itself is drawn
# from this frame in a later cell.
word_freq_df = pd.DataFrame(word_counts.most_common(20), columns=['Word', 'Frequency'])

In [94]:
# Bar chart of the word frequencies.
# This visualization shows what people commonly search for.
fig = px.bar(
    data_frame=word_freq_df,
    x='Word',
    y='Frequency',
    title='Top 20 Most Common Words in Search Queries',
)
fig.show()

Top Queries by Clicks and Impressions

In [95]:
# Ten queries with the most clicks, keeping only the columns needed for plotting.
top_queries_clicks_vis = (
    queries_df
    .nlargest(n=10, columns='Clicks')
    .loc[:, ['Top queries', 'Clicks']]
)
# Same selection, ranked by impressions instead.
top_queries_impressions_vis = (
    queries_df
    .nlargest(n=10, columns='Impressions')
    .loc[:, ['Top queries', 'Impressions']]
)

Plotting

In [96]:
# One bar chart per metric.
fig_clicks = px.bar(
    data_frame=top_queries_clicks_vis,
    x='Top queries',
    y='Clicks',
    title='Top Queries by Clicks',
)
fig_impressions = px.bar(
    data_frame=top_queries_impressions_vis,
    x='Top queries',
    y='Impressions',
    title='Top Queries by Impressions',
)
fig_clicks.show()
fig_impressions.show()

Queries with Highest and Lowest CTR

In [97]:
# Ten queries at each extreme of click-through rate.
top_ctr_vis = (
    queries_df
    .nlargest(n=10, columns='CTR')
    .loc[:, ['Top queries', 'CTR']]
)
bottom_ctr_vis = (
    queries_df
    .nsmallest(n=10, columns='CTR')
    .loc[:, ['Top queries', 'CTR']]
)

# Build both bar charts, then render the highest-CTR chart first.
fig_top_ctr = px.bar(
    data_frame=top_ctr_vis,
    x='Top queries',
    y='CTR',
    title='Top Queries by CTR',
)
fig_bottom_ctr = px.bar(
    data_frame=bottom_ctr_vis,
    x='Top queries',
    y='CTR',
    title='Bottom Queries by CTR',
)
fig_top_ctr.show()
fig_bottom_ctr.show()

Correlation Matrix

In [98]:
# Pairwise correlation between the four numeric metrics, shown as an annotated heatmap.
correlation_matrix = (
    queries_df
    .loc[:, ['Clicks', 'Impressions', 'CTR', 'Position']]
    .corr()
)
fig_corr = px.imshow(
    img=correlation_matrix,
    text_auto=True,
    title='Correlation Matrix',
)
fig_corr.show()

Anomaly Detection using Isolation Forest

In [102]:
from sklearn.ensemble import IsolationForest

# Numeric metrics the model uses to isolate unusual queries.
features = queries_df[['Clicks', 'Impressions', 'CTR', 'Position']]

# Isolation Forest draws random sub-samples and random split points, so without
# a fixed seed the set of flagged queries changes from run to run. Pinning
# random_state makes the notebook reproducible under Restart & Run All.
iso_forest = IsolationForest(
    n_estimators=100,     # number of trees in the forest
    contamination=0.01,   # expected share of anomalies (about 1% of the rows)
    random_state=42,
)

# Fitting the model
iso_forest.fit(features)

Label anomalies (the model's `predict` returns -1 for anomalous rows and 1 for normal rows)

In [103]:
# Score every row with the fitted model and store the label in a new 'anomaly'
# column; the filtering cell that follows treats -1 as the anomaly label.
queries_df['anomaly'] = iso_forest.predict(features)

Filter and display anomalies

In [105]:
# Keep only the rows the model labelled -1 (anomalous).
is_anomaly = queries_df['anomaly'].eq(-1)
anomalies = queries_df.loc[is_anomaly]

report_columns = ['Top queries', 'Clicks', 'Impressions', 'CTR', 'Position']
print(anomalies[report_columns])

                                  Top queries  Clicks  Impressions     CTR  \
0                 number guessing game python    5223        14578  0.3583   
1                         thecleverprogrammer    2809         3456  0.8128   
2            python projects with source code    2077        73380  0.0283   
3   classification report in machine learning    2012         4959  0.4057   
4                       the clever programmer    1931         2528  0.7638   
7                 python turtle graphics code    1455        13585  0.1071   
11                          clever programmer    1243        21566  0.0576   
15                 rock paper scissors python    1111        35824  0.0310   
21                      classification report     933        39896  0.0234   
34                   machine learning roadmap     708        42715  0.0166   

    Position  
0       1.61  
1       1.02  
2       5.94  
3       1.28  
4       1.09  
7       4.60  
11      4.82  
15      7.19  
21    