# Data Quality Assesment

In [1]:
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import kaggle as kg
import zipfile
import matplotlib.pyplot as plt
import textstat
import re


kg.api.authenticate()

## Downloading the dataset

In [2]:
kg.api.competition_download_files(competition = 'learning-agency-lab-automated-essay-scoring-2', path='../src/datasets', quiet = False)
zf = zipfile.ZipFile('../src/datasets/learning-agency-lab-automated-essay-scoring-2.zip') 
submission = pd.read_csv(zf.open('sample_submission.csv'))
test = pd.read_csv(zf.open('test.csv'))
train = pd.read_csv(zf.open('train.csv'))

learning-agency-lab-automated-essay-scoring-2.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
train.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [4]:
len(train)

17307

In [5]:
test.head()

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [6]:
len(test)

3

## DQA

In [7]:
# Missing Values
missing_values = train.isnull().sum()

print("\nMissing Values:\n", missing_values)


Missing Values:
 essay_id     0
full_text    0
score        0
dtype: int64


In [8]:
# Outlier Detection (using Z-score for 'score' column)
from scipy.stats import zscore

train['score_zscore'] = zscore(train['score'])
outliers = train[abs(train['score_zscore']) > 3]

print("\nNumber of Outliers:", len(outliers))


Number of Outliers: 0


In [9]:
# Check for Duplicates
duplicates = train.duplicated().sum()

print("\nNumber of Duplicates:", duplicates)


Number of Duplicates: 0


In [10]:
# Basic Text Quality Checks
# 1. Length of each essay
train['essay_length'] = train['full_text'].apply(len)

# 2. Count of special characters (e.g., non-alphabetic characters)
train['special_char_count'] = train['full_text'].apply(lambda x: sum(not char.isalnum() for char in x))

# 3. Word Count
train['word_count'] = train['full_text'].apply(lambda x: len(x.split()))

# Display a quick overview of relevant columns
display_columns = ['essay_id', 'full_text', 'score', 'essay_length', 'special_char_count', 'word_count', 'score_zscore']
print(train[display_columns].head())


  essay_id                                          full_text  score  \
0  000d118  Many people have car where they live. The thin...      3   
1  000fe60  I am a scientist at NASA that is discussing th...      3   
2  001ab80  People always wish they had the same technolog...      4   
3  001bdc0  We all heard about Venus, the planet without a...      4   
4  002ba53  Dear, State Senator\n\nThis is a letter to arg...      3   

   essay_length  special_char_count  word_count  score_zscore  
0          2677                 542         498      0.049382  
1          1669                 376         332      0.049382  
2          3077                 617         550      1.006440  
3          2701                 518         451      1.006440  
4          2208                 437         373      0.049382  
