## Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import gensim
import nltk
import spacy
import re
import spacy
import matplotlib.pyplot as plt
import string

from ydata_profiling import ProfileReport
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk import tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from IPython.display import Image
from spacy import displacy
from transformers import pipeline
from itertools import product

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file under the Kaggle input directory.
# The unused sub-directory list gets a real name instead of `_`: binding `_`
# at notebook top level replaces IPython's "last output" variable.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read data

In [None]:
# All competition files sit in one read-only Kaggle input folder; the path is
# kept in a single place so it only has to be changed once (e.g. for a local run).
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries"

prompts_train = pd.read_csv(f"{DATA_DIR}/prompts_train.csv")
prompts_test = pd.read_csv(f"{DATA_DIR}/prompts_test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}/summaries_test.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}/summaries_train.csv")

## Explore data

In [None]:
# Preview the first rows of the training prompts (head() shows 5 by default).
prompts_train.head()

In [None]:
# List every column of prompts_train with the dtype pandas assigned on load.
prompts_train.dtypes

In [None]:
# Preview the first rows of the test prompts.
prompts_test.head()

In [None]:
# List every column of prompts_test with its dtype.
prompts_test.dtypes

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

In [None]:
# List every column of sample_submission with its dtype.
sample_submission.dtypes

In [None]:
# Preview the first rows of the test summaries.
summaries_test.head()

In [None]:
# List every column of summaries_test with its dtype.
summaries_test.dtypes

In [None]:
# Preview the first rows of the training summaries.
summaries_train.head()

In [None]:
# List every column of summaries_train with its dtype.
summaries_train.dtypes

In [None]:
# Join every summary with its prompt on the shared "prompt_id" key.
# how="outer" keeps rows whose prompt_id occurs in only one of the two frames;
# the columns coming from the other frame are NaN for such rows.
merged_train = pd.merge(summaries_train, prompts_train, on="prompt_id", how="outer")
merged_train.head()

In [None]:
# Build a ydata-profiling report for the merged training frame; the bare
# `profile` expression on the last line makes it the cell's output.
profile = ProfileReport(merged_train, title="Profiling Report")
profile

As we can see, `wording` and `content` are highly correlated. This is expected: a well-written summary usually scores high on both.
## Label distribution

In [None]:
# Side-by-side histograms of the two target columns.
fig, (content_ax, wording_ax) = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(data=merged_train, x='content', ax=content_ax)
sns.histplot(data=merged_train, x='wording', ax=wording_ax)
plt.show()

In [None]:
# Fetch the NLTK data packages this notebook asks for; 'stopwords' backs the
# stopwords.words('english') calls further down.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

def collapse_dots(input):
    """Collapse every run of periods into a single period.

    A run is two or more ``.`` characters that are adjacent or separated only
    by space characters, e.g. ``"..."`` or ``". . ."``. Other whitespace
    (tabs, newlines) between two periods does not join them into a run.

    Args:
        input: The string to normalise. The name shadows the ``input`` builtin
            inside this function; it is kept so keyword callers keep working.

    Returns:
        A copy of ``input`` with each such run replaced by one ``.``.
    """
    # The greedy match swallows a whole run at once, so a single pass reaches
    # the same result as substituting repeatedly until nothing changes.
    return re.sub(r"\.(?: *\.)+", ".", input)

def process_text(text):
    """Lightly normalise a free-text summary.

    Steps, in order:
      1. Replace each run of CR/LF characters with ``". "``.
      2. Drop a period that directly follows ``!``, ``,``, ``:``, ``;`` or
         ``?`` (step 1 creates such pairs when a line ended with one of them).
      3. Replace every run of characters outside the allowed set with a single
         space. Allowed are ASCII letters and digits, ASCII punctuation except
         the backslash, plus ``’`` and ``ё``. Whitespace is not in the set, so
         whitespace runs collapse to one space as well.
      4. Remove every ``#`` together with the non-whitespace characters that
         follow it.
      5. Collapse repeated periods with ``collapse_dots``.
      6. Lower-case the result and strip leading/trailing whitespace.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    # Non-string values (for example NaN, which is a float) pass through untouched.
    if not isinstance(text, str):
        return text

    text = re.sub(r"[\r\n]+", ". ", text)
    # One substitution covers all five symbols; the symbol itself is kept.
    text = re.sub(r"([!,:;?])\.", r"\1", text)
    # Raw string with the hyphen and brackets escaped explicitly, so the class
    # does not depend on an accidental `,-.` range or on invalid string escapes.
    text = re.sub(r"""[^a-zA-Z0-9!’"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~ё]+""", " ", text)
    text = re.sub(r"#\S+", "", text)
    text = collapse_dots(text)
    return text.lower().strip()

In [None]:
# Store the cleaned summary next to the raw one, then print a few raw/clean
# pairs (picked by row position) as a spot check.
merged_train["clean_text"] = merged_train["text"].map(process_text)
for idx in (10, 100, 150):
    print(f"Before : {merged_train['text'].iat[idx]}\nLight Processing : {merged_train['clean_text'].iat[idx]}\n")

In [None]:
# Count the stop words in the text.
# Filled on first use so the NLTK corpus is read once instead of once per call.
_STOPWORDS_CACHE = {}


def count_stopwords(text: str, stopword_list=None) -> int:
    """Count the whitespace-separated tokens of ``text`` that are stop words.

    Tokens are lower-cased before the lookup. Punctuation attached to a token
    is not stripped, so ``"the,"`` only counts if the list contains ``"the,"``.

    Args:
        text: The text to scan.
        stopword_list: Optional collection of lower-case stop words. When
            omitted, NLTK's English stop-word list is used; it is loaded on the
            first such call and reused afterwards.

    Returns:
        The number of tokens found in the stop-word collection.
    """
    if stopword_list is None:
        if "english" not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE["english"] = frozenset(stopwords.words('english'))
        stopword_list = _STOPWORDS_CACHE["english"]
    return sum(1 for word in text.split() if word.lower() in stopword_list)

# Count the punctuation characters in the text.
# string.punctuation -> !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
def count_punctuation(text: str) -> int:
    """Return how many characters of ``text`` belong to ``string.punctuation``."""
    marks = frozenset(string.punctuation)
    return len([ch for ch in text if ch in marks])

# Count the numbers (runs of consecutive digits) in the text.
def count_numbers(text: str) -> int:
    """Return the number of digit runs in ``text``.

    Each maximal run of consecutive digits counts once, so ``"2023"`` is one
    match while ``"3.14"`` is two.
    """
    return sum(1 for _match in re.finditer(r'\d+', text))

# Derive simple count features from a text column.
def feature_engineer(dataframe_real: pd.DataFrame, feature: str = 'clean_text') -> pd.DataFrame:
    """Return a copy of ``dataframe_real`` with count features for ``feature``.

    Five columns are added, each named ``<feature>_<suffix>``:
      * ``_word_cnt``     - number of whitespace-separated tokens
      * ``_length``       - number of characters
      * ``_stopword_cnt`` - result of ``count_stopwords``
      * ``_punct_cnt``    - result of ``count_punctuation``
      * ``_number_cnt``   - result of ``count_numbers``

    Args:
        dataframe_real: Input frame; it is not modified.
        feature: Name of the text column to measure.

    Returns:
        A new DataFrame holding the original columns plus the five counts.
    """
    dataframe = dataframe_real.copy()
    # Missing values are measured as empty text, so a NaN row yields zeros
    # instead of raising; the source column itself is left unchanged.
    texts = dataframe[feature].fillna("")
    # split() without a separator ignores repeated, leading and trailing
    # whitespace and returns no tokens for an empty string, whereas
    # split(' ') would report one word for "".
    dataframe[f'{feature}_word_cnt'] = texts.map(lambda x: len(x.split()))
    dataframe[f'{feature}_length'] = texts.map(len)
    dataframe[f'{feature}_stopword_cnt'] = texts.map(count_stopwords)
    dataframe[f'{feature}_punct_cnt'] = texts.map(count_punctuation)
    dataframe[f'{feature}_number_cnt'] = texts.map(count_numbers)
    return dataframe

In [None]:
# Add the count features for the default "clean_text" column.
merged_train_stats = feature_engineer(merged_train)
merged_train_stats.head()
# clean_text_length is measured in characters

In [None]:
# Summary statistics of merged_train_stats, including the new count features.
merged_train_stats.describe()

In [None]:
# Concatenate prompt and summary into one string per row, laid out as
# "<prompt_title>. <prompt_question> <prompt_text>. <text>".
# The raw summary column ("text") is used here, not "clean_text".
merged_train_stats["merged_text"] = (
    merged_train_stats["prompt_title"] + ". " + merged_train_stats["prompt_question"] + " " +  merged_train_stats["prompt_text"] + ". " + merged_train_stats["text"]
)

# Show the merged string of the row at position 10 as an example.
merged_train_stats["merged_text"].iloc[10]