In [None]:
## Predicting Readability of Texts Using Machine Learning

### 1.1 Loading and Preprocessing Data
import pandas as pd
import nltk
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textstat import flesch_kincaid_grade, dale_chall_readability_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
train_read = pd.read_csv('train_read.csv')
test_read = pd.read_csv('test_read.csv')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

train_read['processed_text'] = train_read['excerpt'].apply(preprocess_text)
test_read['processed_text'] = test_read['excerpt'].apply(preprocess_text)

### 1.2 Feature Engineering
vectorizer = TfidfVectorizer(max_features=1000)
train_tfidf = vectorizer.fit_transform(train_read['processed_text']).toarray()

# Save TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

def compute_features(text):
    return {
        "flesch_kincaid": flesch_kincaid_grade(text),
        "dale_chall": dale_chall_readability_score(text),
        "sentence_length": len(text.split('.')) / max(1, len(text.split())),
        "complex_word_ratio": sum(1 for word in text.split() if len(word) > 6) / max(1, len(text.split()))
    }

train_features = train_read['processed_text'].apply(compute_features).apply(pd.Series)
train_read = pd.concat([train_read, train_features], axis=1)

X = pd.concat([train_read[train_features.columns], pd.DataFrame(train_tfidf)], axis=1)
X.columns = X.columns.astype(str)  # Ensure all column names are strings
y = train_read["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 1.3 Training Machine Learning Models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": xgb.XGBRegressor(),
    "LightGBM": lgb.LGBMRegressor(),
    "CatBoost": cb.CatBoostRegressor(verbose=0)
}

best_model = None
best_mae = float('inf')

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"{name} MAE: {mae}")
    if mae < best_mae:
        best_mae = mae
        best_model = model

# Save the best model
with open('readability_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

### 1.4 Predicting Readability for Test Data
# Load TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

# Transform test data
test_tfidf = vectorizer.transform(test_read['processed_text']).toarray()
test_features = test_read['processed_text'].apply(compute_features).apply(pd.Series)
test_read = pd.concat([test_read, test_features], axis=1)

# Ensure feature consistency
test_features_combined = pd.concat([test_read[test_features.columns], pd.DataFrame(test_tfidf)], axis=1)
test_features_combined.columns = test_features_combined.columns.astype(str)

# Load model and make predictions
with open('readability_model.pkl', 'rb') as file:
    model = pickle.load(file)

test_predictions = model.predict(test_features_combined)

# Prepare submission file
submission = pd.DataFrame({
    'id': test_read['id'],
    'readability_score': test_predictions
})

submission.to_csv('submission.csv', index=False)


Linear Regression MAE: 0.7009732238888944
Decision Tree MAE: 0.8402647814056438
Random Forest MAE: 0.5984732166172333
Gradient Boosting MAE: 0.5976987577988884
XGBoost MAE: 0.5755948490452598
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 29846
[LightGBM] [Info] Number of data points in the train set: 2267, number of used features: 977
[LightGBM] [Info] Start training from score -0.964399
LightGBM MAE: 0.5862729720667806
CatBoost MAE: 0.5709145236690176


In [6]:
import nltk

# Download required resources
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.5


In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting numpy<2.0,>=1.16.0 (from catboost)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, catboost
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully in

In [6]:
pip install --upgrade numpy catboost


Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m


In [10]:
!pip uninstall -y numpy catboost
!pip install --no-cache-dir numpy catboost


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: catboost 1.2.7
Uninstalling catboost-1.2.7:
  Successfully uninstalled catboost-1.2.7
Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m45.7 MB/s[0m eta [36m0:00:

In [1]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NATHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pickle
import pandas as pd
from textstat import flesch_kincaid_grade, dale_chall_readability_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the saved TF-IDF vectorizer and model
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

with open('readability_model.pkl', 'rb') as file:
    model = pickle.load(file)

# Define test sentences
test_sentences = [
    "The cat sat on the mat.",
    "She enjoys reading books in the library.",
    "We went to the zoo and saw many animals.",
    "Apples and bananas are my favorite fruits.",
    "It is a sunny day, and the sky is blue.",
    "He runs fast and jumps high in the competition.",

    # Difficult Sentences (Complex words, long structure, harder readability)
    "The intricacies of quantum mechanics elude those unfamiliar with advanced physics.",
    "A comprehensive analysis of socio-economic disparities reveals multifaceted systemic inequities.",
    "The juxtaposition of baroque embellishments and modernist minimalism creates a paradoxical aesthetic.",
    "Cognizance of epistemological paradigms necessitates an advanced grasp of philosophical discourse."
]

# Compute readability-based features
def compute_features(text):
    return {
        "flesch_kincaid": flesch_kincaid_grade(text),
        "dale_chall": dale_chall_readability_score(text),
        "sentence_length": len(text.split('.')) / max(1, len(text.split())),
        "complex_word_ratio": sum(1 for word in text.split() if len(word) > 6) / max(1, len(text.split()))
    }

test_features = pd.DataFrame([compute_features(text) for text in test_sentences])

# Transform text using TF-IDF
test_tfidf = vectorizer.transform(test_sentences).toarray()
test_tfidf_df = pd.DataFrame(test_tfidf)

# Combine readability features and TF-IDF features
test_combined = pd.concat([test_features, test_tfidf_df], axis=1)
test_combined.columns = test_combined.columns.astype(str)  # Ensure column names are strings

# Make predictions
predictions = model.predict(test_combined)

# Define readability level interpretation
def interpret_score(score):
    if score >= -0.1:  # New threshold for "Easy"
        return "Easy"
    elif score >= -1.0:  # "Moderate" now covers a better range
        return "Moderate"
    else:
        return "Difficult"

# Print results with readability level
for sentence, score in zip(test_sentences, predictions):
    print(f"Sentence: {sentence}\nReadability Level: {interpret_score(score)} (Score: {score:.3f})\n")


# Print results with readability level
for sentence, score in zip(test_sentences, predictions):
    print(f"Sentence: {sentence}\nReadability Level: {interpret_score(score)} (Score: {score:.3f})\n")


Sentence: The cat sat on the mat.
Readability Level: Moderate (Score: -0.184)

Sentence: She enjoys reading books in the library.
Readability Level: Easy (Score: -0.046)

Sentence: We went to the zoo and saw many animals.
Readability Level: Easy (Score: 0.207)

Sentence: Apples and bananas are my favorite fruits.
Readability Level: Moderate (Score: -0.387)

Sentence: It is a sunny day, and the sky is blue.
Readability Level: Easy (Score: 0.093)

Sentence: He runs fast and jumps high in the competition.
Readability Level: Easy (Score: 0.055)

Sentence: The intricacies of quantum mechanics elude those unfamiliar with advanced physics.
Readability Level: Difficult (Score: -1.270)

Sentence: A comprehensive analysis of socio-economic disparities reveals multifaceted systemic inequities.
Readability Level: Difficult (Score: -2.096)

Sentence: The juxtaposition of baroque embellishments and modernist minimalism creates a paradoxical aesthetic.
Readability Level: Difficult (Score: -1.402)

Se