### SQuAD Dataset Exploratory Data Analysis (EDA)

#### By Group 3:

| Name       | ID   |
|------------|------|
| Muhammad Affan Naved      | 25100283  |
| Hassan Ali        | 25100037  |
| Talha Tariq    | 25100041  |


#### Import libraries

In [1]:
# Libraries needed for preprocessing and eda on SQuAD dataset
import pandas as pd # squad is in csv format with columns [index,question,context,answer_start,text,c_id]
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re
import spacy
import string
import nltk
from nltk.corpus import stopwords

#### Load dataset

In [2]:
# Location of the SQuAD v1.1 training split (CSV export)
input_pth = 'DM_Dataset/v1.1/train.csv'

# Read the CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer=input_pth)
df.head(n=5)

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


### Preprocessing the Dataset

In [3]:
# Preprocessing

# Rename columns for readability.
# Reassign instead of using inplace=True so the frame produced by the load
# cell is not mutated behind the reader's back; keys that are already renamed
# are ignored by rename(), so re-running this cell is harmless.
df = df.rename(columns={"text": "answer", "c_id": "context_id"})

# Display first few rows
df.head()


Unnamed: 0,index,question,context,answer_start,answer,context_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [4]:
# Count fully identical rows before any removal
dup_count = df.duplicated().sum()
print(f"Duplicates before removal: {dup_count}")

# Keep the first occurrence of each row and discard the repeats
df = df.drop_duplicates(keep="first")

# Recount to confirm nothing duplicated is left
dup_count = df.duplicated().sum()
print(f"Duplicates after removal: {dup_count}")


Duplicates before removal: 0
Duplicates after removal: 0


In [5]:
# Per-column count of null entries before cleaning
print(df.isna().sum())

# Discard every row that holds at least one null value
df = df.dropna(axis=0, how="any")

# Per-column count again, to verify the nulls are gone
print(df.isna().sum())


index           0
question        0
context         0
answer_start    0
answer          3
context_id      0
dtype: int64
index           0
question        0
context         0
answer_start    0
answer          0
context_id      0
dtype: int64


In [6]:
# Normalise the three text columns: lowercase, then trim surrounding whitespace.
# NOTE(review): trimming `context` could shift `answer_start` offsets if any
# passage begins with whitespace — confirm before using the offsets downstream.
for text_col in ("question", "context", "answer"):
    df[text_col] = df[text_col].str.lower().str.strip()

# Preview the normalised rows
df.head()

Unnamed: 0,index,question,context,answer_start,answer,context_id
0,5733be284776f41900661182,to whom did the virgin mary allegedly appear i...,"architecturally, the school has a catholic cha...",515,saint bernadette soubirous,0
1,5733be284776f4190066117f,what is in front of the notre dame main building?,"architecturally, the school has a catholic cha...",188,a copper statue of christ,0
2,5733be284776f41900661180,the basilica of the sacred heart at notre dame...,"architecturally, the school has a catholic cha...",279,the main building,0
3,5733be284776f41900661181,what is the grotto at notre dame?,"architecturally, the school has a catholic cha...",381,a marian place of prayer and reflection,0
4,5733be284776f4190066117e,what sits on top of the main building at notre...,"architecturally, the school has a catholic cha...",92,a golden statue of the virgin mary,0


In [10]:
# Remove stopwords, punctuation and special characters.
# The English stopword list ships as a separate NLTK corpus; fetch it on demand
# so the notebook also runs on a machine where it has not been downloaded yet.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Built once: deleting punctuation uses the same table for every row.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Return `text` with stopwords and punctuation removed.

    The text is split on whitespace. A token is dropped when it is a stopword
    either in its written form (ignoring punctuation at its edges, so that
    apostrophe stopwords such as "don't" still match) or after all punctuation
    has been deleted from it. Surviving tokens are returned with punctuation
    deleted, joined by single spaces.

    Parameters
    ----------
    text : str
        Input string; the stopword comparison is case-sensitive, so callers
        are expected to lowercase beforehand if the stopword set is lowercase.

    Returns
    -------
    str
        The cleaned string (empty if nothing survives).
    """
    kept = []
    for token in text.split():
        # Match stopwords before the apostrophe is removed ("don't," -> "don't").
        if token.strip(string.punctuation) in stop_words:
            continue
        stripped = token.translate(_PUNCT_TABLE)
        # Tokens made only of punctuation collapse to "" and are discarded.
        if stripped and stripped not in stop_words:
            kept.append(stripped)
    return " ".join(kept)

# Derive a "<column>_clean" companion for each text column via clean_text
for source_col in ("question", "context", "answer"):
    df[f"{source_col}_clean"] = df[source_col].apply(clean_text)

df.head()

Unnamed: 0,index,question,context,answer_start,answer,context_id,question_clean,context_clean,answer_clean
0,5733be284776f41900661182,to whom did the virgin mary allegedly appear i...,"architecturally, the school has a catholic cha...",515,saint bernadette soubirous,0,virgin mary allegedly appear 1858 lourdes france,architecturally school catholic character atop...,saint bernadette soubirous
1,5733be284776f4190066117f,what is in front of the notre dame main building?,"architecturally, the school has a catholic cha...",188,a copper statue of christ,0,front notre dame main building,architecturally school catholic character atop...,copper statue christ
2,5733be284776f41900661180,the basilica of the sacred heart at notre dame...,"architecturally, the school has a catholic cha...",279,the main building,0,basilica sacred heart notre dame beside structure,architecturally school catholic character atop...,main building
3,5733be284776f41900661181,what is the grotto at notre dame?,"architecturally, the school has a catholic cha...",381,a marian place of prayer and reflection,0,grotto notre dame,architecturally school catholic character atop...,marian place prayer reflection
4,5733be284776f4190066117e,what sits on top of the main building at notre...,"architecturally, the school has a catholic cha...",92,a golden statue of the virgin mary,0,sits top main building notre dame,architecturally school catholic character atop...,golden statue virgin mary


In [None]:
# Tokenize and lemmatize text
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return the space-joined lemmas of every token spaCy finds in `text`."""
    return " ".join(token.lemma_ for token in nlp(text))


def _lemmatize_column(series):
    """Lemmatize a whole text column, processing each distinct string once.

    Produces the same strings as `series.apply(lemmatize_text)`, but repeated
    values (the same passage shared by many rows) go through the pipeline a
    single time, and texts are streamed through `nlp.pipe` so spaCy can batch
    them instead of being called once per row.
    """
    unique_texts = series.drop_duplicates().tolist()
    # nlp.pipe yields documents in input order, so zip pairs text with its doc.
    lemma_by_text = {
        text: " ".join(token.lemma_ for token in doc)
        for text, doc in zip(unique_texts, nlp.pipe(unique_texts))
    }
    return series.map(lemma_by_text)


df["question_lemma"] = _lemmatize_column(df["question_clean"])
df["context_lemma"] = _lemmatize_column(df["context_clean"])
df["answer_lemma"] = _lemmatize_column(df["answer_clean"])

df.head()

### EDA

#### Overview

In [19]:
# Dataset overview: dimensions, column labels, dtypes and per-column
# cardinality, emitted through a single newline-separated print call.
print(
    f"Shape of the dataset: {df.shape}",
    f"Columns: {df.columns}",
    f"\nData types: {df.dtypes}",
    f"\nNumber of unique values in each column: {df.nunique()}",
    sep="\n",
)




Shape of the dataset: (87596, 9)
Columns: Index(['index', 'question', 'context', 'answer_start', 'answer', 'context_id',
       'question_clean', 'context_clean', 'answer_clean'],
      dtype='object')

Data types: index             object
question          object
context           object
answer_start       int64
answer            object
context_id         int64
question_clean    object
context_clean     object
answer_clean      object
dtype: object

Number of unique values in each column: index             87596
question          87309
context           18891
answer_start       1604
answer            64261
context_id        18891
question_clean    87085
context_clean     18889
answer_clean      61428
dtype: int64


In [20]:
def _word_count(text):
    """Number of whitespace-separated tokens in `text`."""
    return len(text.split())


# Word count of every question, context and answer, stored as "<column>_length"
text_cols = ("question", "context", "answer")
for text_col in text_cols:
    df[f"{text_col}_length"] = df[text_col].apply(_word_count)

# Summary statistics of the three length columns
length_cols = [f"{text_col}_length" for text_col in text_cols]
print(df[length_cols].describe())

       question_length  context_length  answer_length
count     87596.000000    87596.000000   87596.000000
mean         10.061064      119.762832       3.162233
std           3.559231       49.365597       3.392368
min           1.000000       20.000000       1.000000
25%           8.000000       89.000000       1.000000
50%          10.000000      110.000000       2.000000
75%          12.000000      142.000000       3.000000
max          40.000000      653.000000      43.000000
