In [1]:
import PyPDF2
import re
import pandas as pd
import os
import tiktoken
from textblob import TextBlob

In [2]:
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all its pages as one string.

    This is best-effort: any exception raised while opening or parsing the
    file is reported with ``print`` and the text gathered before the failure
    is returned (an empty string if nothing was read).

    :param pdf_path: Path to the PDF file.
    :return: Page texts concatenated without a separator.
    """
    page_texts = []
    try:
        with open(pdf_path, "rb") as pdf_file:
            for pdf_page in PyPDF2.PdfReader(pdf_file).pages:
                # Fall back to "" when a page yields a falsy result.
                page_texts.append(pdf_page.extract_text() or "")
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")

    return "".join(page_texts)

In [3]:
folder_path = "PDF files for stat eval project"

# One record per PDF; the file name without its extension is used as the
# country label. os.listdir() returns entries in arbitrary order, so the names
# are sorted to make the DataFrame's row order reproducible between runs.
data = []
for filename in sorted(os.listdir(folder_path)):
    if filename.lower().endswith(".pdf"):
        full_path = os.path.join(folder_path, filename)
        extracted_text = extract_text_from_pdf(full_path)
        data.append({
            "country": os.path.splitext(filename)[0],
            "text": extracted_text
        })

# Create a DataFrame from the collected data. Naming the columns explicitly
# keeps "country" and "text" present even when no PDF was found.
df = pd.DataFrame(data, columns=["country", "text"])

In [4]:
# Lazily matches everything up to and including the user's question;
# (?s) lets "." span newlines.
prompt = r"(?s).*?What educational path would you recommend for me\?"

# Remove that leading part so "answer" keeps only what follows the question.
# Texts that do not contain the question are left unchanged.
df["answer"] = df["text"].map(lambda raw_text: re.sub(prompt, "", raw_text))

In [5]:
# Export footer text followed by a "<digits>/<digits>" counter
# (presumably the page number stamped by the PDF export).
pattern = r"Printed using ChatGPT to PDF, powered by PDFCrowd HTML to PDF API\. \d+/\d+"

# Delete every occurrence of the footer from the answers.
df["answer"] = df["answer"].str.replace(re.compile(pattern), "", regex=True)

In [6]:
# Whitespace-delimited word count of each answer (0 for non-string values).
df["word_count"] = df["answer"].map(
    lambda answer_text: 0 if not isinstance(answer_text, str) else len(answer_text.split())
)

In [7]:
def count_unique_words(s: str) -> int:
    """
    Return the number of distinct word tokens in ``s``, ignoring case.

    A token is a maximal run of ``\\w`` characters, so punctuation is dropped
    and an apostrophe splits a word in two. Non-string input yields 0.
    """
    if isinstance(s, str):
        distinct_tokens = {match.group() for match in re.finditer(r"\b\w+\b", s.lower())}
        return len(distinct_tokens)
    return 0
# Distinct-word count per answer.
df["unique_word_count"] = df["answer"].map(count_unique_words)

In [8]:
# Encoding that tiktoken associates with the "gpt-4o" model name; it is looked
# up once here and reused by count_tokens_tiktoken.
encoding = tiktoken.encoding_for_model("gpt-4o")
def count_tokens_tiktoken(s: str) -> int:
    """
    Return how many tokens the module-level ``encoding`` produces for ``s``.

    Non-string input yields 0.
    """
    if isinstance(s, str):
        # encode() gives the list of token IDs; its length is the token count.
        token_ids = encoding.encode(s)
        return len(token_ids)
    return 0

# Token count per answer.
df["token_count"] = df["answer"].map(count_tokens_tiktoken)

print(df.loc[:, ["country", "token_count"]])

                                country  token_count
0                           Afghanistan          602
1                               Albania          562
2                               Algeria          599
3                               Andorra          626
4                                Angola          547
..                                  ...          ...
188  Venezuela (Bolivarian Republic of)          828
189                            Viet Nam          591
190                               Yemen          564
191                              Zambia          626
192                            Zimbabwe          771

[193 rows x 2 columns]


In [9]:
def avg_word_length(s: str) -> float:
    """
    Return the mean length of the ``\\w+`` tokens in ``s``, rounded to one decimal.

    Non-string, empty or whitespace-only input yields 0.0, as does text that
    contains no word token at all.
    """
    if not (isinstance(s, str) and s.strip()):
        return 0.0
    token_lengths = [len(token) for token in re.findall(r"\b\w+\b", s)]
    # "or 1" avoids dividing by zero when the text has no word tokens.
    divisor = len(token_lengths) or 1
    return round(sum(token_lengths) / divisor, 1)

# Mean word length per answer.
df["avg_word_length"] = df["answer"].map(avg_word_length)

In [10]:
def sentence_stats(s: str):
    """
    Return ``(sentence_count, average_words_per_sentence)`` for ``s``.

    Sentences are the non-blank pieces obtained by splitting on runs of
    ``.``, ``!`` or ``?``; words are whitespace-delimited. The average is
    rounded to one decimal. Non-string, blank, or punctuation-only input
    yields ``(0, 0.0)``.
    """
    if not (isinstance(s, str) and s.strip()):
        return (0, 0.0)
    # Split on sentence-ending punctuation and trim each piece.
    fragments = (piece.strip() for piece in re.split(r"[.!?]+", s.strip()))
    # Word count of every non-empty piece.
    words_per_sentence = [len(fragment.split()) for fragment in fragments if fragment]
    if not words_per_sentence:
        return (0, 0.0)
    n_sentences = len(words_per_sentence)
    return n_sentences, round(sum(words_per_sentence) / n_sentences, 1)

# Build both columns in one step from the (count, average) tuples.
# Wrapping each tuple in a pd.Series would upcast the integer sentence count
# to float; a single DataFrame built from the tuples keeps it an integer.
df[["sentence_count", "avg_sentence_length"]] = pd.DataFrame(
    df["answer"].map(sentence_stats).tolist(),
    index=df.index,
    columns=["sentence_count", "avg_sentence_length"],
)

In [11]:
"""
TextBlob reports sentiment as a tuple of the form (polarity, subjectivity),
where polarity is a float within the range [-1.0, 1.0]
and subjectivity is a float within the range [0.0, 1.0],
where 0.0 is very objective and 1.0 is very subjective.
Only the polarity is used below.
"""

def sentiment_textblob(s: str) -> float:
    """
    Return the TextBlob sentiment polarity of ``s`` (range [-1.0, 1.0]).

    Non-string, empty or whitespace-only input yields 0.0.
    """
    has_text = isinstance(s, str) and bool(s.strip())
    if has_text:
        return TextBlob(s).sentiment.polarity
    return 0.0

# Sentiment polarity per answer.
df["sentiment_polarity"] = df["answer"].map(sentiment_textblob)

In [12]:
import textstat

def readability_scores(s: str) -> dict:
    """
    Return the readability metrics of ``s`` as a dict.

    The only key is ``"flesch_reading_ease"``, computed with textstat.
    Non-string, empty or whitespace-only input gets a score of 0.0.
    """
    flesch = 0.0
    if isinstance(s, str) and s.strip():
        flesch = textstat.flesch_reading_ease(s)
    return {"flesch_reading_ease": flesch}

# Expand your DataFrame: one column per readability metric.
# The score dicts are turned into a frame in a single constructor call instead
# of building one pd.Series per row.
scores_df = pd.DataFrame(df["answer"].map(readability_scores).tolist(), index=df.index)
# Drop score columns left over from an earlier run of this cell so that
# re-running it does not append duplicate columns.
df = pd.concat([df.drop(columns=scores_df.columns, errors="ignore"), scores_df], axis=1)

In [13]:
# Character class covering the Unicode ranges treated as emoji in this analysis.
# The trailing "+" makes each match a run of consecutive characters from these
# ranges, so adjacent emojis form a single match.
emoji_pattern = re.compile(
    "["                      # start character class
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Misc Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols & Pictographs
    "\U00002600-\U000026FF"  # Misc Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "]+",
    flags=re.UNICODE
)

def count_emojis(s: str) -> int:
    """
    Count the emoji runs in ``s``.

    A run is one or more consecutive characters from the ranges in
    ``emoji_pattern``; two emojis with nothing between them count once.

    :param s: Text to scan.
    :return: Number of runs found; 0 for ``None``, NaN or any other non-string.
    """
    # Same guard as the other metric functions: a NaN cell is a truthy float
    # and would otherwise reach findall() and raise TypeError.
    if not isinstance(s, str):
        return 0
    return len(emoji_pattern.findall(s))

# Emoji count per answer.
df["emoji_count"] = df["answer"].map(count_emojis)

In [14]:
def count_ib_acronym(s: str) -> int:
    """
    Return how many times 'IB' appears in ``s`` as a standalone word.

    Matching is case-insensitive ('IB', 'ib', 'Ib', ...). Word boundaries on
    both sides exclude longer words that merely contain the letters, while
    forms wrapped in punctuation such as '(IB)' still count. Non-string input
    yields 0.
    """
    if isinstance(s, str):
        return sum(1 for _ in re.finditer(r"\bIB\b", s, flags=re.IGNORECASE))
    return 0

# Standalone "IB" mentions per answer.
df["ib_count"] = df["answer"].map(count_ib_acronym)

def count_specific_terms(s: str, terms: list) -> dict:
    """
    Count case-insensitive substring occurrences of each term in ``s``.

    Matching is plain substring matching (``str.count``), so a term is also
    counted inside longer words, and overlapping occurrences are not counted
    twice.

    :param s: Text to search; any non-string value gives a count of 0 per term.
    :param terms: Terms to look for. They are used unchanged as the result keys.
    :return: Dict mapping each term to its number of occurrences.
    """
    if not isinstance(s, str):
        return {term: 0 for term in terms}
    lower_s = s.lower()
    # The text is lowercased, so the search term must be lowercased too;
    # otherwise a term containing an uppercase letter could never match.
    return {term: lower_s.count(term.lower()) for term in terms}

# Lowercase substrings tallied in every answer; each becomes its own column later.
keyWords = [
    "personal", "tailor",
    "htx", "stx", "hf", "hhx", "10", "fgu", "eux", "eud",
    "?", "!",
    "vet", "erhverv", "university",
    "if you", "uu-vejleder",
    "background", "hobb", "goal", "interest",
]

# Keyword tallies per answer, kept as one dict per row for now.
df["keywords"] = df["answer"].map(
    lambda answer_text: count_specific_terms(answer_text, keyWords)
)
tempDF = pd.DataFrame(df.loc[:, ["country", "keywords"]])

tempDF

Unnamed: 0,country,keywords
0,Afghanistan,"{'personal': 0, 'tailor': 0, 'htx': 2, 'stx': ..."
1,Albania,"{'personal': 0, 'tailor': 1, 'htx': 1, 'stx': ..."
2,Algeria,"{'personal': 1, 'tailor': 1, 'htx': 1, 'stx': ..."
3,Andorra,"{'personal': 1, 'tailor': 1, 'htx': 2, 'stx': ..."
4,Angola,"{'personal': 1, 'tailor': 1, 'htx': 1, 'stx': ..."
...,...,...
188,Venezuela (Bolivarian Republic of),"{'personal': 1, 'tailor': 0, 'htx': 1, 'stx': ..."
189,Viet Nam,"{'personal': 0, 'tailor': 1, 'htx': 2, 'stx': ..."
190,Yemen,"{'personal': 0, 'tailor': 0, 'htx': 3, 'stx': ..."
191,Zambia,"{'personal': 0, 'tailor': 1, 'htx': 1, 'stx': ..."


In [15]:
# Turn the per-row keyword dicts into one column per keyword
keywords_expanded = pd.DataFrame(df["keywords"].tolist(), index=df.index)

# Replace the dict column with the expanded keyword columns
df = pd.concat([df.drop(columns=["keywords"]), keywords_expanded], axis=1)

In [16]:
# Display the feature table built so far
df

Unnamed: 0,country,text,answer,word_count,unique_word_count,token_count,avg_word_length,sentence_count,avg_sentence_length,sentiment_polarity,...,!,vet,erhverv,university,if you,uu-vejleder,background,hobb,goal,interest
0,Afghanistan,Uddannelsesmuligheder i Danmark\nI'm a 16 year...,\nThank you for sharing your background — that...,420,198,602,5.0,26.0,16.2,0.182529,...,0,0,2,3,8,0,2,0,0,1
1,Albania,Uddannelsesmuligheder efter folkeskole\nI'm a ...,\nThank you for sharing a bit about yourself —...,396,211,562,5.1,27.0,14.7,0.226125,...,0,0,1,5,13,0,0,0,1,4
2,Algeria,Uddannelsesmuligheder efter folkeskole\nI'm a ...,\nThank you for sharing a bit about yourself —...,418,232,599,5.0,26.0,16.1,0.199577,...,0,3,1,3,13,0,1,0,0,4
3,Andorra,Uddannelsesvalg efter folkeskolen\nI'm a 16 ye...,\nThank you for sharing a bit about yourself —...,434,224,626,5.2,34.0,12.9,0.177300,...,0,1,1,2,10,0,1,0,0,5
4,Angola,Uddannelsesvalg efter folkeskole\nI'm a 16 yea...,\nThanks for sharing — you're in a very import...,370,202,547,5.0,23.0,16.1,0.242352,...,0,3,1,2,8,0,1,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,Venezuela (Bolivarian Republic of),Uddannelsesvalg efter Folkeskole\nI'm a 16 yea...,\nThanks for sharing a bit about yourself — th...,579,287,828,5.0,33.0,17.5,0.204796,...,1,0,2,7,11,0,3,0,1,2
189,Viet Nam,Uddannelsesmuligheder efter folkeskolen\nI'm a...,\nThanks for sharing that about yourself — and...,406,212,591,5.2,38.0,10.8,0.228874,...,2,0,1,4,6,0,1,0,2,3
190,Yemen,Uddannelsesmuligheder i Danmark\nI'm a 16 year...,\nThat's great — you’re in an exciting positio...,383,203,564,5.0,27.0,14.3,0.147194,...,0,0,1,3,3,1,0,0,1,6
191,Zambia,Uddannelsesmuligheder i Danmark\nI'm a 16 year...,\nThanks for sharing that — you're in an impor...,433,234,626,5.0,26.0,16.7,0.208594,...,0,0,1,6,4,0,1,0,1,3


In [17]:
# Display summary statistics for the flesch_reading_ease column only
df[['flesch_reading_ease']].describe()

Unnamed: 0,flesch_reading_ease
count,193.0
mean,45.295841
std,5.427663
min,30.455215
25%,41.903765
50%,45.607309
75%,49.081286
max,57.29912


In [18]:
# Summary statistics for the numeric columns of df
df.describe()

Unnamed: 0,word_count,unique_word_count,token_count,avg_word_length,sentence_count,avg_sentence_length,sentiment_polarity,flesch_reading_ease,emoji_count,ib_count,...,!,vet,erhverv,university,if you,uu-vejleder,background,hobb,goal,interest
count,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,...,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0,193.0
mean,443.564767,227.300518,648.207254,5.031606,32.129534,14.270466,0.203685,45.295841,4.694301,0.891192,...,0.357513,0.440415,1.036269,3.699482,7.989637,0.11399,1.373057,0.165803,0.673575,3.989637
std,45.286555,18.185183,62.093332,0.157076,6.236553,2.505662,0.034374,5.427663,2.730007,1.243137,...,0.646834,0.977787,0.543648,1.404179,3.418683,0.318625,0.987262,0.372871,0.655248,1.590956
min,338.0,185.0,480.0,4.7,17.0,9.1,0.09375,30.455215,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,411.0,216.0,603.0,4.9,27.0,12.7,0.181107,41.903765,2.0,0.0,...,0.0,0.0,1.0,3.0,6.0,0.0,1.0,0.0,0.0,3.0
50%,440.0,228.0,643.0,5.0,31.0,14.0,0.205175,45.607309,5.0,0.0,...,0.0,0.0,1.0,4.0,8.0,0.0,1.0,0.0,1.0,4.0
75%,474.0,238.0,693.0,5.1,37.0,16.0,0.226899,49.081286,7.0,1.0,...,1.0,0.0,1.0,4.0,10.0,0.0,2.0,0.0,1.0,5.0
max,579.0,287.0,828.0,5.4,52.0,24.9,0.323953,57.29912,11.0,7.0,...,3.0,4.0,3.0,8.0,16.0,1.0,4.0,1.0,4.0,8.0


In [19]:
# List the column names currently present in df
df.columns

Index(['country', 'text', 'answer', 'word_count', 'unique_word_count',
       'token_count', 'avg_word_length', 'sentence_count',
       'avg_sentence_length', 'sentiment_polarity', 'flesch_reading_ease',
       'emoji_count', 'ib_count', 'personal', 'tailor', 'htx', 'stx', 'hf',
       'hhx', '10', 'fgu', 'eux', 'eud', '?', '!', 'vet', 'erhverv',
       'university', 'if you', 'uu-vejleder', 'background', 'hobb', 'goal',
       'interest'],
      dtype='object')

In [20]:
# Write df to answer_data.csv (relative path), leaving out the row index
df.to_csv("answer_data.csv", index=False)

In [21]:
# Load the member-state reference table from CSV and display it
additional_data = pd.read_csv("member_state_auths_2025-03-14.csv")
additional_data

Unnamed: 0,Member State,M49 Code,ISO Code,Start date,End date,Other Names,Earlier Name,Later Name,Geographic Term,Membership Document Symbol,Scope Note,French,Spanish,Arabic,Chinese,Russian
0,Afghanistan,4,AFG,11/09/1946,,"Islamic Republic of Afghanistan, Transitional ...",,,AFGHANISTAN,A/RES/34 (I),"Per UNTERM, formal name (or long form) ""Islami...",Afghanistan,Afganistán,أفغانستان,阿富汗,Афганистан
1,Albania,8,ALB,14/12/1955,,Republic of Albania,,,ALBANIA,A/RES/995 (X),,Albanie,Albania,ألبانيا,阿尔巴尼亚,Албания
2,Algeria,12,DZA,10/08/1962,,"People's Democratic Republic of Algeria, Al-Ja...",,,ALGERIA,A/RES/1754 (XVII),,Algérie,Argelia,الجزائر,阿尔及利亚,Алжир
3,Andorra,20,AND,28/07/1993,,Principality of Andorra,,,ANDORRA,A/RES/47/232,,Andorre,Andorra,أندورا,安道尔,Андорра
4,Angola,24,AGO,01/12/1976,,Republic of Angola,,,ANGOLA,A/RES/31/44,,Angola,Angola,أنغولا,安哥拉,Ангола
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,Venezuela (Bolivarian Republic of),862,VEN,18/11/2004,,Bolivarian Republic of Venezuela,Venezuela,,VENEZUELA (BOLIVARIAN REPUBLIC OF),,"By letter of 17 Nov. 2004, the Permanent Missi...",Venezuela (République bolivarienne du),Venezuela (República Bolivariana de),فنزويلا (جمهورية - البوليفارية),委内瑞拉玻利瓦尔共和国,Венесуэла (Боливарианская Республика)
189,Viet Nam,704,VNM,02/07/1976,,"Socialist Republic of Viet Nam, Công Hoa Xa Hô...",Democratic Republic of Viet-Nam,,VIET NAM,A/RES/32/2,Use for works published from 2 July 1976,Viet Nam,Viet Nam,فييت نام,越南,Вьетнам
190,Yemen,887,YEM,22/05/1990,,Republic of Yemen,Yemen Arab Republic,,YEMEN,,,Yémen,Yemen,اليمن,也门,Йемен
191,Zambia,894,ZMB,01/12/1964,,Republic of Zambia,Northern Rhodesia,,ZAMBIA,A/5815,,Zambie,Zambia,زامبيا,赞比亚,Замбия


In [22]:
# Left join: every row of df is kept and matched on an exact
# country == "Member State" comparison; unmatched rows get NaN in the added columns.
joined_df = df.merge(
    additional_data,
    how="left",
    left_on="country",
    right_on="Member State",
)
joined_df

Unnamed: 0,country,text,answer,word_count,unique_word_count,token_count,avg_word_length,sentence_count,avg_sentence_length,sentiment_polarity,...,Earlier Name,Later Name,Geographic Term,Membership Document Symbol,Scope Note,French,Spanish,Arabic,Chinese,Russian
0,Afghanistan,Uddannelsesmuligheder i Danmark\nI'm a 16 year...,\nThank you for sharing your background — that...,420,198,602,5.0,26.0,16.2,0.182529,...,,,AFGHANISTAN,A/RES/34 (I),"Per UNTERM, formal name (or long form) ""Islami...",Afghanistan,Afganistán,أفغانستان,阿富汗,Афганистан
1,Albania,Uddannelsesmuligheder efter folkeskole\nI'm a ...,\nThank you for sharing a bit about yourself —...,396,211,562,5.1,27.0,14.7,0.226125,...,,,ALBANIA,A/RES/995 (X),,Albanie,Albania,ألبانيا,阿尔巴尼亚,Албания
2,Algeria,Uddannelsesmuligheder efter folkeskole\nI'm a ...,\nThank you for sharing a bit about yourself —...,418,232,599,5.0,26.0,16.1,0.199577,...,,,ALGERIA,A/RES/1754 (XVII),,Algérie,Argelia,الجزائر,阿尔及利亚,Алжир
3,Andorra,Uddannelsesvalg efter folkeskolen\nI'm a 16 ye...,\nThank you for sharing a bit about yourself —...,434,224,626,5.2,34.0,12.9,0.177300,...,,,ANDORRA,A/RES/47/232,,Andorre,Andorra,أندورا,安道尔,Андорра
4,Angola,Uddannelsesvalg efter folkeskole\nI'm a 16 yea...,\nThanks for sharing — you're in a very import...,370,202,547,5.0,23.0,16.1,0.242352,...,,,ANGOLA,A/RES/31/44,,Angola,Angola,أنغولا,安哥拉,Ангола
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,Venezuela (Bolivarian Republic of),Uddannelsesvalg efter Folkeskole\nI'm a 16 yea...,\nThanks for sharing a bit about yourself — th...,579,287,828,5.0,33.0,17.5,0.204796,...,Venezuela,,VENEZUELA (BOLIVARIAN REPUBLIC OF),,"By letter of 17 Nov. 2004, the Permanent Missi...",Venezuela (République bolivarienne du),Venezuela (República Bolivariana de),فنزويلا (جمهورية - البوليفارية),委内瑞拉玻利瓦尔共和国,Венесуэла (Боливарианская Республика)
189,Viet Nam,Uddannelsesmuligheder efter folkeskolen\nI'm a...,\nThanks for sharing that about yourself — and...,406,212,591,5.2,38.0,10.8,0.228874,...,Democratic Republic of Viet-Nam,,VIET NAM,A/RES/32/2,Use for works published from 2 July 1976,Viet Nam,Viet Nam,فييت نام,越南,Вьетнам
190,Yemen,Uddannelsesmuligheder i Danmark\nI'm a 16 year...,\nThat's great — you’re in an exciting positio...,383,203,564,5.0,27.0,14.3,0.147194,...,Yemen Arab Republic,,YEMEN,,,Yémen,Yemen,اليمن,也门,Йемен
191,Zambia,Uddannelsesmuligheder i Danmark\nI'm a 16 year...,\nThanks for sharing that — you're in an impor...,433,234,626,5.0,26.0,16.7,0.208594,...,Northern Rhodesia,,ZAMBIA,A/5815,,Zambie,Zambia,زامبيا,赞比亚,Замбия
