In [1]:
import polars as pl
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [2]:
# Download necessary NLTK data (only needs to be done once)
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ruhwang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/ruhwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# --- 1. Data Loading and Inspection ---
from typing import Dict


def load_data(file_path: str) -> pl.DataFrame:
    """Loads data from a CSV file into a Polars DataFrame.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A Polars DataFrame.  Handles potential errors.
    """
    try:
        df = pl.read_csv(file_path)
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# --- 2. Basic Data Cleaning ---
def clean_text_dataframe(df: pl.DataFrame, text_column: str) -> pl.DataFrame:
    """Cleans text data: lowercase, removes special chars, emails, phone numbers.

    Args:
        df: The Polars DataFrame.
        text_column: The name of the column with the text.

    Returns:
        A Polars DataFrame with a 'processed_text' column (string type).
    """
    ldf = df.lazy()

    # Lowercasing
    ldf = ldf.with_columns(pl.col(text_column).str.to_lowercase().alias("processed_text"))

    # Removing Special Characters, Emails, and Phone Numbers
    def clean_text(text: str) -> str:
        if text is None:
            return ""
        text = re.sub(r'\S*@\S*\s?', '', text)
        text = re.sub(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', '', text)
        text = re.sub(r'\d{3}-\d{3}-\d{4}', '', text)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Keep spaces and alphanumeric
        return text

    ldf = ldf.with_columns(pl.col("processed_text").map_elements(clean_text, return_dtype=pl.String).alias("processed_text"))

    return ldf.collect()  # Return a DataFrame

# --- 3. Text Preprocessing (Part 2: Tokenization, Stopwords, Lemmatization) ---

def tokenize_and_process(df: pl.DataFrame) -> Dict:
    """Tokenizes, removes stopwords, and lemmatizes the 'processed_text' column.

    Args:
        df: Polars DataFrame from clean_text_dataframe (must have 'processed_text').

    Returns:
        A list of lists of strings (tokens).
    """

    # --- Download necessary NLTK data if not present
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')

    # --- setup
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def remove_stopwords(tokens: list) -> list:
        return [word for word in tokens if word not in stop_words]

    def lemmatize_tokens(tokens: list) -> list:
        return [lemmatizer.lemmatize(word) for word in tokens]

    # --- Processing as list of lists
    # Convert the 'processed_text' column to a Python list of strings
    text_list = df["processed_text"].to_list()
    score_list = df["score"].to_list()

    all_tokens = {}
    for text, score in zip(text_list, score_list):
        tokens = word_tokenize(text) #tokenize
        tokens = remove_stopwords(tokens) #remove stopwords
        tokens = lemmatize_tokens(tokens) # lemmatize
        all_tokens[text] = score

    return all_tokens

In [6]:
# Process Data
file_path = './data/google_play_reviews.csv'
df = load_data(file_path)

if df is not None:
    print(df.head())  # Display the first few rows to inspect the data
    print(df.schema) # show column and data types
    print(df.shape) # Check dimensions (rows, columns)

shape: (5, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ reviewId  ┆ userName  ┆ userImage ┆ content   ┆ … ┆ replyCont ┆ repliedAt ┆ sortOrder ┆ appId    │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ent       ┆ ---       ┆ ---       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ str       ┆   ┆ ---       ┆ str       ┆ str       ┆ str      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ gp:AOqpTO ┆ Lex S     ┆ https://l ┆ I love    ┆ … ┆ Any.do is ┆ 2020-08-0 ┆ most_rele ┆ com.anyd │
│ HNHm4Ofbj ┆           ┆ h3.google ┆ this app, ┆   ┆ not only  ┆ 5         ┆ vant      ┆ o        │
│ kxEXXa51J ┆           ┆ userconte ┆ but I do  ┆   ┆ a product ┆ 08:02:08  ┆           ┆          │
│ wZE…      ┆           ┆ nt.…      ┆ have…     ┆   ┆ b…        ┆           

In [7]:
df.columns

['reviewId',
 'userName',
 'userImage',
 'content',
 'score',
 'thumbsUpCount',
 'reviewCreatedVersion',
 'at',
 'replyContent',
 'repliedAt',
 'sortOrder',
 'appId']

In [8]:
clean = clean_text_dataframe(df, "content")
print(clean.head())

shape: (5, 13)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ reviewId  ┆ userName  ┆ userImage ┆ content   ┆ … ┆ repliedAt ┆ sortOrder ┆ appId     ┆ processe │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ d_text   │
│ str       ┆ str       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ gp:AOqpTO ┆ Lex S     ┆ https://l ┆ I love    ┆ … ┆ 2020-08-0 ┆ most_rele ┆ com.anydo ┆ i love   │
│ HNHm4Ofbj ┆           ┆ h3.google ┆ this app, ┆   ┆ 5         ┆ vant      ┆           ┆ this app │
│ kxEXXa51J ┆           ┆ userconte ┆ but I do  ┆   ┆ 08:02:08  ┆           ┆           ┆ but i do │
│ wZE…      ┆           ┆ nt.…      ┆ have…     ┆   ┆           ┆           

In [9]:
clean.columns

['reviewId',
 'userName',
 'userImage',
 'content',
 'score',
 'thumbsUpCount',
 'reviewCreatedVersion',
 'at',
 'replyContent',
 'repliedAt',
 'sortOrder',
 'appId',
 'processed_text']

In [10]:
def select_relevant_features(df: pl.DataFrame, cols: list[str] = ["processed_text", "score"]) -> pl.DataFrame:
    """
    Selects only the specified columns from a Polars DataFrame.

    Args:
        df: The input Polars DataFrame.
        cols: A list of column names to select.  Defaults to ["processed_text", "score"].

    Returns:
        A new Polars DataFrame containing only the selected columns.
        Returns None if any specified column is not found.
    """
    if not all(col in df.columns for col in cols):
        missing_cols = [col for col in cols if col not in df.columns]
        print(f"Error: The following columns are not present in the DataFrame: {missing_cols}")
        return None

    return df.select(cols)

In [11]:
tokenize_and_process(clean)

[nltk_data] Downloading package wordnet to /Users/ruhwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{'i love this app but i do have one major gripe  i want an option to buy premium once even at a higher price and not as a monthly or yearly subscription edit  very unhappy with developer response to pricing i want to buy premium for this great product they made but its not a service this isnt netflix youre going to turn potential buyers away with this pricing model': 1,
 'trash yes it has some nice nifty features but it lacks in complete necessities to actually plan your day you can make a task perfectly include subtasks and then add a nice color to the task but no yellow because thats already the fixed color of the priority label reminders are great except you can only have one if i want to have a reminder on wednesday 10am and a reminder on friday 6pm id need a different app because this app only allows you to set 1 reminder frustrating 010': 1,
 'omg the ui is awful seriously you have popup for premium every 2 seconds theres no intuitive way see a normal calendar you have to create 

In [12]:
cleaned = select_relevant_features(clean)

In [13]:
text = clean.select(pl.col("processed_text"))

In [14]:
text.head()

processed_text
str
"""i love this app but i do have …"
"""trash yes it has some nice nif…"
"""omg the ui is awful seriously …"
"""ive been using the app for a w…"
"""unable to register with an ema…"


In [15]:
clean.select(pl.col("score")).describe()

statistic,score
str,f64
"""count""",16092.0
"""null_count""",0.0
"""mean""",3.052448
"""std""",1.309287
"""min""",1.0
"""25%""",2.0
"""50%""",3.0
"""75%""",4.0
"""max""",5.0


In [16]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
def create_features(df: pl.DataFrame, text_column: str = "processed_text",
                    use_tfidf: bool = True, use_embeddings: bool = True,
                    embedding_model_name: str = 'all-MiniLM-L6-v2', tfidf_max_features: int = 50) -> tuple[pl.DataFrame, list[str]]:
    """
    Creates a reduced set of features from the text data.

    Args:
        df: Polars DataFrame with cleaned text.
        text_column: Column with cleaned text.
        use_tfidf: Whether to use a limited set of TF-IDF features.
        use_embeddings: Whether to use sentence embeddings.
        embedding_model_name: SentenceTransformer model name.
        tfidf_max_features: Maximum number of TF-IDF features to keep.

    Returns:
        tuple: (DataFrame with features, list of feature names)
    """
    if text_column not in df.columns:
        print(f"Error: Text column '{text_column}' not found in DataFrame.")
        return None, None

    df = df.drop_nulls(subset=[text_column])
    df_features = df.clone()

    # --- 1. Basic Text Features ---
    df_features = df_features.with_columns(pl.col(text_column).str.split(" ").list.len().alias("word_count"))
    basic_features = ["word_count"]  # Keep only word_count

    # --- 2. TF-IDF (Reduced) ---
    # tfidf_features = []
    # if use_tfidf:
    #     tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)  # Significantly reduce features
    #     tfidf_matrix = tfidf_vectorizer.fit_transform(df_features[text_column].to_list()).toarray()
    #     tfidf_feature_names = [f"tfidf_{word}" for word in tfidf_vectorizer.get_feature_names_out()]
    #     tfidf_df = pl.DataFrame(tfidf_matrix, schema=tfidf_feature_names)
    #     df_features = pl.concat([df_features, tfidf_df], how="horizontal")
    #     tfidf_features = tfidf_feature_names

    # --- 3. Sentence Embeddings (Reduced Dimensionality) ---
    embedding_features = []
    if use_embeddings:
        try:
            model = SentenceTransformer(embedding_model_name)
            embeddings = model.encode(df_features[text_column].to_list())
            # Reduce dimensions using PCA (Principal Component Analysis)
            #   This is now done AFTER model training, so it is removed.
            embedding_feature_names = [f"embedding_{i}" for i in range(embeddings.shape[1])]  #Keep all embeddings
            embedding_df = pl.DataFrame(embeddings, schema=embedding_feature_names)
            df_features = pl.concat([df_features, embedding_df], how="horizontal")
            embedding_features = embedding_feature_names

        except Exception as e:
            print(f"Error creating embeddings: {e}. Skipping embeddings.")


    all_feature_names = basic_features + embedding_features # + tfidf_features
    return df_features, all_feature_names

In [18]:
df_features, all_feature_names = create_features(cleaned)

In [19]:
df_features.head()

processed_text,score,word_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,embedding_32,embedding_33,…,embedding_347,embedding_348,embedding_349,embedding_350,embedding_351,embedding_352,embedding_353,embedding_354,embedding_355,embedding_356,embedding_357,embedding_358,embedding_359,embedding_360,embedding_361,embedding_362,embedding_363,embedding_364,embedding_365,embedding_366,embedding_367,embedding_368,embedding_369,embedding_370,embedding_371,embedding_372,embedding_373,embedding_374,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383
str,i64,u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""i love this app but i do have …",1,72,-0.047453,-0.060031,-0.034426,-0.009703,-0.008174,0.063428,0.027241,0.017791,0.04605,0.047504,0.034316,0.049114,0.016172,-0.011512,0.064808,-0.049179,0.014111,-0.025833,-0.018018,-0.016402,-0.041497,-0.036361,-0.029669,0.019841,0.069844,-0.075518,-0.050072,0.046379,0.044743,-0.000308,0.042263,0.029296,0.043337,-0.024368,…,-0.000966,0.028419,-0.009314,-0.017118,0.030986,0.057432,-0.084941,-0.004941,-0.021422,-0.031107,-0.018906,-0.038331,0.001131,0.065106,0.003241,0.03925,-0.058629,0.046571,0.035767,-0.004846,-0.128384,0.130455,-0.003305,0.038528,0.042512,0.09225,0.006178,-0.021763,0.000984,0.005795,-0.068307,-0.070464,0.078519,0.081752,-0.058226,-0.005623,0.031395
"""trash yes it has some nice nif…",1,91,-0.079747,-0.02671,0.036071,0.011079,0.047783,0.026888,0.141307,-0.016001,0.031244,-0.028236,-0.107873,-0.032179,-0.001614,0.066751,0.052827,-0.0429,0.04429,-0.04469,-0.028178,-0.041436,-0.005961,-0.038388,0.075092,0.053493,-0.022579,0.026048,0.032108,-0.037488,0.024765,-0.070826,0.008426,0.05399,0.029501,-0.033927,…,0.122747,0.086368,0.062537,0.073482,0.026704,0.057684,-0.023055,0.004997,-0.021818,-0.037236,0.002421,-0.00071,0.035266,0.024598,-0.016853,-0.04847,-0.003764,-0.062504,-0.044694,-0.098011,-0.161701,-0.029775,0.001773,-0.089625,-0.002101,0.030992,0.042549,-0.001999,0.040085,0.060035,-0.06471,0.008922,0.0137,0.174862,0.027857,0.008931,0.022128
"""omg the ui is awful seriously …",1,58,-0.062107,0.023588,0.0093,0.033817,-0.02762,0.047502,-0.027526,0.003799,0.075574,-0.024239,-0.064885,-0.080283,-0.069809,0.029515,0.006226,-0.109445,-0.012951,-0.053632,0.005477,-0.009591,-0.0298,-0.138527,-0.036083,-0.011421,0.020266,-0.021816,-0.027546,0.092325,0.032676,0.094895,0.068517,-0.018851,0.050268,-0.069902,…,0.015858,0.025159,0.022465,0.016919,0.022832,0.087139,-0.10018,-0.038386,0.040057,-0.076847,0.002657,-0.023029,-0.01097,0.041079,-0.055557,0.076657,0.012015,-0.066442,0.049131,0.012014,-0.127499,0.066377,0.018277,-0.006179,-0.024294,0.049335,-0.05163,-0.012069,0.059753,0.041372,-0.08612,-0.003253,0.035595,0.063629,0.083599,-0.011801,0.043407
"""ive been using the app for a w…",1,37,-0.033075,-0.103079,0.037433,-0.009245,0.031375,-0.016156,0.031458,0.017541,0.058086,0.02334,-0.096206,-0.03061,-0.012269,0.063373,0.004026,-0.00927,0.016077,-0.035612,-0.089467,-0.005271,-0.050627,-0.065716,0.102525,0.113137,0.008474,-0.09335,-0.065243,-0.008072,0.025232,0.009648,0.024615,0.079125,-0.062771,-0.018996,…,0.07558,-0.025138,0.022439,0.062073,-0.005681,0.086332,-0.049178,-0.01674,0.044007,0.013544,-0.056906,0.051662,0.005229,0.096855,-0.025587,0.03109,0.0291,-0.040724,-0.033398,0.076685,-0.063152,0.063978,0.010475,-0.025734,-0.037836,0.013043,-0.026016,0.033077,-0.034866,0.002653,-0.125522,0.045003,-0.032204,0.08709,0.004783,-0.059723,0.057431
"""unable to register with an ema…",1,61,-0.063726,-0.114863,0.009041,-0.021613,0.051421,-0.018601,0.013461,-0.02951,-0.060776,-0.035377,0.002636,0.004348,-0.026516,0.025701,0.014903,0.075085,-0.043391,-0.029722,0.001646,0.044516,-0.090537,-0.087035,-0.014222,0.040279,-0.013841,-0.146024,-0.06247,-0.027728,0.022735,0.026931,0.123657,0.053006,0.032405,-0.052358,…,0.07786,0.003339,-0.030116,-0.039828,-0.031946,-0.071069,-0.102714,-0.076674,0.024339,-0.010812,-0.086327,-0.028348,-0.160422,0.10068,0.104388,0.131993,-0.029724,-0.075086,0.041978,-0.006346,0.028814,0.080522,0.009935,0.012038,0.077384,0.038326,0.033069,-0.005689,0.008543,0.018891,-0.021314,0.082656,0.011239,0.028442,0.039011,-0.008461,-0.000532


In [20]:
from sklearn.decomposition import PCA

In [28]:
def split_and_engineer_datasets(df: pl.DataFrame, feature_cols: list, score_column: str = "score",
                       poly_degree: int = 3, test_size: float = 0.25, val_size: float = 0.1,
                       n_components: int = 25): 
    """
    Trains, evaluates, and performs dimensionality reduction.
    """
    # if score_column not in df.columns:
    #     print(f"Error: Score column '{score_column}' not found in DataFrame")
    #     return None, None, None, None, None
    # if not set(feature_cols).issubset(df.columns):
    #     missing_cols = set(feature_cols) - set(df.columns)
    #     print(f"Error: Feature columns {missing_cols} not found in DataFrame")
    #     return None, None, None, None, None

    df = df.drop_nulls() # subset=feature_cols + [score_column]
    X = df.select(feature_cols).to_numpy()
    y = df.select(score_column).to_numpy().flatten()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=0)

    # --- Dimensionality Reduction (PCA) - AFTER splitting ---
    # Identify embedding columns
    embedding_cols = [col for col in feature_cols if col.startswith("embedding_")]
    non_embedding_cols = [col for col in feature_cols if not col.startswith("embedding_")]

    if embedding_cols: # Only do PCA if there are embeddings
        # Separate embedding features and non-embedding features
        X_train_embed = X_train[:, [feature_cols.index(col) for col in embedding_cols]]
        X_val_embed = X_val[:, [feature_cols.index(col) for col in embedding_cols]]
        X_test_embed = X_test[:, [feature_cols.index(col) for col in embedding_cols]]

        X_train_non_embed = X_train[:, [feature_cols.index(col) for col in non_embedding_cols]]
        X_val_non_embed = X_val[:, [feature_cols.index(col) for col in non_embedding_cols]]
        X_test_non_embed = X_test[:, [feature_cols.index(col) for col in non_embedding_cols]]

        # Apply PCA to embedding features ONLY
        pca = PCA(n_components=n_components)
        X_train_embed_reduced = pca.fit_transform(X_train_embed)  # Fit on training data
        X_val_embed_reduced = pca.transform(X_val_embed)        # Transform validation data
        X_test_embed_reduced = pca.transform(X_test_embed)       # Transform test data

        # Combine reduced embeddings with other features
        X_train_combined = np.concatenate([X_train_non_embed, X_train_embed_reduced], axis=1)
        X_val_combined = np.concatenate([X_val_non_embed, X_val_embed_reduced], axis=1)
        X_test_combined = np.concatenate([X_test_non_embed, X_test_embed_reduced], axis=1)

        # Update feature names
        reduced_embedding_feature_names = [f"pca_embedding_{i}" for i in range(n_components)]
        feature_cols_combined = non_embedding_cols + reduced_embedding_feature_names

    else: #if no embeddings, just combine normally
        X_train_combined = X_train
        X_val_combined = X_val
        X_test_combined = X_test
        feature_cols_combined = feature_cols

    # --- Polynomial Features ---
    poly = PolynomialFeatures(degree=poly_degree)
    X_train_poly = poly.fit_transform(X_train_combined)
    X_val_poly = poly.transform(X_val_combined)
    X_test_poly = poly.transform(X_test_combined)
    poly_feature_names = poly.get_feature_names_out(feature_cols_combined)

    return X_train_poly, y_train, X_test_poly, y_test, X_val_poly, y_val, poly_feature_names

In [29]:
cols = [c for c in df_features.columns if c not in ["processed_text", "score"]]

In [30]:
x_train, y_train, x_test, y_test, x_val, y_val, poly_feature_names = split_and_engineer_datasets(df_features, cols)

In [31]:
# --- Model Training and Evaluation ---

def train_and_evaluate(x_train, y_train, x_val, y_test):
    """
    Trains and evaluates a polynomial regression model.

    Args:
        df: Polars DataFrame with features and score.
        feature_cols: List of feature column names.
        score_column: Column with the target score.
        poly_degree: Degree for polynomial features.
        test_size: proportion for test set
        val_size: proportion for validation set (from training data)

    Returns:
        tuple: (trained model, polynomial feature names, train metrics, validation metrics, test metrics)"
    """
    # --- Model Training ---
    model = LinearRegression()
    model.fit(x_train, y_train)

    # --- Evaluation ---
    y_train_pred = model.predict(x_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    train_metrics = {"RMSE": train_rmse, "R2": train_r2}

    y_val_pred = model.predict(x_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_r2 = r2_score(y_val, y_val_pred)
    val_metrics = {"RMSE": val_rmse, "R2": val_r2}

    y_test_pred = model.predict(x_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    test_metrics = {"RMSE": test_rmse, "R2": test_r2}

    return model, poly_feature_names, train_metrics, val_metrics, test_metrics

In [32]:
model, poly_feature_names, train_metrics, val_metrics, test_metrics = train_and_evaluate(x_train, y_train, x_val, y_test)

In [33]:
train_metrics

{'RMSE': np.float64(0.6988818964763621), 'R2': 0.7171129861351382}

In [34]:
val_metrics

{'RMSE': np.float64(1.048829263263843), 'R2': 0.35547382502520064}

Potential overfitting

In [35]:
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler  # Import StandardScaler

def cluster_with_bgmm(df: pl.DataFrame, feature_cols: list, k_max: int = 10, random_state: int = 42) -> tuple[np.ndarray, np.ndarray]:
    """
    Performs clustering using Bayesian Gaussian Mixture Models (BGMM).

    Args:
        df: Polars DataFrame containing the features.
        feature_cols: List of column names to use as features.
        k_max:  Maximum number of clusters (BGMM will infer the actual number).
        random_state: Random seed for reproducibility.

    Returns:
        tuple: (probabilities, cluster_labels)
            probabilities: NumPy array of shape (n_samples, n_clusters)
                           with probabilities of each point belonging to each cluster.
            cluster_labels: NumPy array of shape (n_samples,) with the
                           most likely cluster assignment for each point.
            Returns (None, None) if there's an error.
    """
    if not set(feature_cols).issubset(df.columns):
        missing_cols = set(feature_cols) - set(df.columns)
        print(f"Error: Feature columns {missing_cols} not found in DataFrame.")
        return None, None

    # Drop rows with missing values in the feature columns
    df = df.drop_nulls(subset=feature_cols)

    # Extract features as a NumPy array
    X = df.select(feature_cols).to_numpy()

    # --- Data Scaling (Important for GMMs) ---
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # --- Bayesian Gaussian Mixture Model ---
    bgmm = BayesianGaussianMixture(
        n_components=k_max,
        random_state=random_state,
        weight_concentration_prior_type="dirichlet_process"  # For automatic cluster number inference
    )
    bgmm.fit(X_scaled)  # Fit on the *scaled* data

    # --- Get Probabilities and Cluster Labels ---
    probabilities = bgmm.predict_proba(X_scaled)  # Probabilities for each point and cluster
    cluster_labels = bgmm.predict(X_scaled)  # Hard cluster assignments

    return probabilities, cluster_labels

# --- Example Usage (assuming you have 'df_with_features' and 'feature_names') ---


In [36]:
type(df_features)

polars.dataframe.frame.DataFrame

In [37]:
x_train.shape

(10862, 3654)

In [38]:
probabilities, cluster_labels = cluster_with_bgmm(df_features, cols, k_max=5) # Example: Max 5 clusters

if probabilities is not None and cluster_labels is not None:
    print("Probabilities (first 5 rows):\n", probabilities[:5])
    print("\nCluster Labels (first 5 rows):\n", cluster_labels[:5])

    # Add cluster labels and probabilities to the DataFrame (optional)
    # df_with_clusters = x_train.with_columns(pl.Series(name="cluster", values=cluster_labels))

    # Convert probabilities to a Polars DataFrame for easier handling
    probabilities_df = pl.DataFrame(probabilities)
    probabilities_df.columns = [f"prob_cluster_{i}" for i in range(probabilities.shape[1])]

    # Concatenate
    df_with_clusters = pl.concat([df_features, probabilities_df], how = "horizontal")
    print("\nDataFrame with Cluster Labels and Probabilities (first 5 rows):\n", df_with_clusters.head())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Probabilities (first 5 rows):
 [[1.00000000e+000 1.85661604e-114 7.57063200e-159 3.86163208e-108
  1.84626146e-199]
 [1.33702028e-114 1.00000000e+000 2.23382957e-054 6.15668423e-050
  2.83226993e-113]
 [3.13076168e-030 1.00000000e+000 1.16237386e-038 1.61144493e-082
  6.83303331e-175]
 [1.20551563e-076 1.00000000e+000 2.20812490e-014 1.53919249e-046
  4.84577203e-165]
 [1.59261691e-158 4.69984043e-212 1.00000000e+000 4.11251587e-262
  1.15812063e-250]]

Cluster Labels (first 5 rows):
 [0 1 1 1 2]

DataFrame with Cluster Labels and Probabilities (first 5 rows):
 shape: (5, 392)
┌────────────┬───────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ processed_ ┆ score ┆ word_count ┆ embedding_ ┆ … ┆ prob_clus ┆ prob_clus ┆ prob_clus ┆ prob_clus │
│ text       ┆ ---   ┆ ---        ┆ 0          ┆   ┆ ter_1     ┆ ter_2     ┆ ter_3     ┆ ter_4     │
│ ---        ┆ i64   ┆ u32        ┆ ---        ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ str       

In [43]:
def get_max_prob_cluster(df: pl.DataFrame) -> pl.DataFrame:
    """
    Finds the maximum probability among 'prob_cluster' columns and assigns the relevant
    cluster number to a new 'cluster' column.
    """
    prob_cols = [col for col in df.columns if col.startswith("prob_cluster")]
    if not prob_cols:
        print("Error: No 'prob_cluster' columns found in DataFrame.")
        return df
    
    # Use LazyFrame for potential optimization
    ldf = df.lazy() 

    # Ensure correct data types (optional, but good practice)
    for col in prob_cols:
        if ldf.schema[col] != pl.Float64:  # Use .schema on LazyFrame
            ldf = ldf.with_columns(pl.col(col).cast(pl.Float64))

    max_prob_expr = pl.max_horizontal(prob_cols)
    when_expressions = [
        pl.when(pl.col(col) == max_prob_expr).then(i)
        for i, col in enumerate(prob_cols)
    ]
    cluster_number_expr = pl.coalesce(when_expressions).alias("cluster")

    ldf = ldf.with_columns(cluster_number_expr)
    # Cast to Categorical
    # ldf = ldf.with_columns(pl.col("cluster")# .cast(pl.Categorical))  

    # Collect at the very end
    return ldf.collect() 

In [44]:
final = get_max_prob_cluster(df_with_clusters)
final

  if ldf.schema[col] != pl.Float64:  # Use .schema on LazyFrame


processed_text,score,word_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,embedding_32,embedding_33,…,embedding_353,embedding_354,embedding_355,embedding_356,embedding_357,embedding_358,embedding_359,embedding_360,embedding_361,embedding_362,embedding_363,embedding_364,embedding_365,embedding_366,embedding_367,embedding_368,embedding_369,embedding_370,embedding_371,embedding_372,embedding_373,embedding_374,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383,prob_cluster_0,prob_cluster_1,prob_cluster_2,prob_cluster_3,prob_cluster_4,cluster
str,i64,u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,i32
"""i love this app but i do have …",1,72,-0.047453,-0.060031,-0.034426,-0.009703,-0.008174,0.063428,0.027241,0.017791,0.04605,0.047504,0.034316,0.049114,0.016172,-0.011512,0.064808,-0.049179,0.014111,-0.025833,-0.018018,-0.016402,-0.041497,-0.036361,-0.029669,0.019841,0.069844,-0.075518,-0.050072,0.046379,0.044743,-0.000308,0.042263,0.029296,0.043337,-0.024368,…,-0.084941,-0.004941,-0.021422,-0.031107,-0.018906,-0.038331,0.001131,0.065106,0.003241,0.03925,-0.058629,0.046571,0.035767,-0.004846,-0.128384,0.130455,-0.003305,0.038528,0.042512,0.09225,0.006178,-0.021763,0.000984,0.005795,-0.068307,-0.070464,0.078519,0.081752,-0.058226,-0.005623,0.031395,1.0,1.8566e-114,7.5706e-159,3.8616e-108,1.8463e-199,0
"""trash yes it has some nice nif…",1,91,-0.079747,-0.02671,0.036071,0.011079,0.047783,0.026888,0.141307,-0.016001,0.031244,-0.028236,-0.107873,-0.032179,-0.001614,0.066751,0.052827,-0.0429,0.04429,-0.04469,-0.028178,-0.041436,-0.005961,-0.038388,0.075092,0.053493,-0.022579,0.026048,0.032108,-0.037488,0.024765,-0.070826,0.008426,0.05399,0.029501,-0.033927,…,-0.023055,0.004997,-0.021818,-0.037236,0.002421,-0.00071,0.035266,0.024598,-0.016853,-0.04847,-0.003764,-0.062504,-0.044694,-0.098011,-0.161701,-0.029775,0.001773,-0.089625,-0.002101,0.030992,0.042549,-0.001999,0.040085,0.060035,-0.06471,0.008922,0.0137,0.174862,0.027857,0.008931,0.022128,1.3370e-114,1.0,2.2338e-54,6.1567e-50,2.8323e-113,1
"""omg the ui is awful seriously …",1,58,-0.062107,0.023588,0.0093,0.033817,-0.02762,0.047502,-0.027526,0.003799,0.075574,-0.024239,-0.064885,-0.080283,-0.069809,0.029515,0.006226,-0.109445,-0.012951,-0.053632,0.005477,-0.009591,-0.0298,-0.138527,-0.036083,-0.011421,0.020266,-0.021816,-0.027546,0.092325,0.032676,0.094895,0.068517,-0.018851,0.050268,-0.069902,…,-0.10018,-0.038386,0.040057,-0.076847,0.002657,-0.023029,-0.01097,0.041079,-0.055557,0.076657,0.012015,-0.066442,0.049131,0.012014,-0.127499,0.066377,0.018277,-0.006179,-0.024294,0.049335,-0.05163,-0.012069,0.059753,0.041372,-0.08612,-0.003253,0.035595,0.063629,0.083599,-0.011801,0.043407,3.1308e-30,1.0,1.1624e-38,1.6114e-82,6.8330e-175,1
"""ive been using the app for a w…",1,37,-0.033075,-0.103079,0.037433,-0.009245,0.031375,-0.016156,0.031458,0.017541,0.058086,0.02334,-0.096206,-0.03061,-0.012269,0.063373,0.004026,-0.00927,0.016077,-0.035612,-0.089467,-0.005271,-0.050627,-0.065716,0.102525,0.113137,0.008474,-0.09335,-0.065243,-0.008072,0.025232,0.009648,0.024615,0.079125,-0.062771,-0.018996,…,-0.049178,-0.01674,0.044007,0.013544,-0.056906,0.051662,0.005229,0.096855,-0.025587,0.03109,0.0291,-0.040724,-0.033398,0.076685,-0.063152,0.063978,0.010475,-0.025734,-0.037836,0.013043,-0.026016,0.033077,-0.034866,0.002653,-0.125522,0.045003,-0.032204,0.08709,0.004783,-0.059723,0.057431,1.2055e-76,1.0,2.2081e-14,1.5392e-46,4.8458e-165,1
"""unable to register with an ema…",1,61,-0.063726,-0.114863,0.009041,-0.021613,0.051421,-0.018601,0.013461,-0.02951,-0.060776,-0.035377,0.002636,0.004348,-0.026516,0.025701,0.014903,0.075085,-0.043391,-0.029722,0.001646,0.044516,-0.090537,-0.087035,-0.014222,0.040279,-0.013841,-0.146024,-0.06247,-0.027728,0.022735,0.026931,0.123657,0.053006,0.032405,-0.052358,…,-0.102714,-0.076674,0.024339,-0.010812,-0.086327,-0.028348,-0.160422,0.10068,0.104388,0.131993,-0.029724,-0.075086,0.041978,-0.006346,0.028814,0.080522,0.009935,0.012038,0.077384,0.038326,0.033069,-0.005689,0.008543,0.018891,-0.021314,0.082656,0.011239,0.028442,0.039011,-0.008461,-0.000532,1.5926e-158,4.6998e-212,1.0,4.1125e-262,1.1581e-250,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""have used for several years it…",5,19,-0.028173,-0.067874,-0.010964,-0.025066,0.015712,0.005331,-0.078747,0.044256,-0.05055,0.019849,-0.091338,0.118173,0.034326,0.008396,-0.059352,0.005649,-0.009678,-0.043042,0.107165,-0.039939,-0.029935,-0.021414,-0.005722,-0.041048,0.031934,0.037421,0.036501,0.048507,0.002199,-0.030589,-0.028098,0.099723,0.041456,-0.058372,…,0.009597,0.03738,0.020403,-0.014709,0.038068,0.013824,-0.010498,0.036954,0.008105,-0.054729,0.039203,0.043877,-0.023841,0.049228,-0.087317,0.044579,-0.062512,-0.071843,-0.001968,0.011855,0.061582,0.090759,0.006422,0.016775,-0.05127,-0.002725,0.07141,-0.104607,-0.013554,-0.084408,0.039908,2.3799e-266,2.5367e-36,2.7293e-162,9.6424e-40,1.0,4
"""love it love that it keeps wha…",5,30,-0.111257,0.098506,0.075841,-0.008457,0.012321,0.018893,0.03667,-0.060696,0.029017,0.00776,-0.021641,0.080044,0.043017,0.056948,-0.000882,-0.003079,-0.074527,0.013344,-0.000712,0.006652,-0.06956,-0.038563,0.069685,0.034528,-0.052297,0.099155,-0.061438,-0.016871,-0.010531,-0.109501,-0.035919,0.017694,0.018878,0.006097,…,0.013154,-0.054352,-0.040634,-0.005155,-0.029567,0.009648,-0.01992,0.052549,-0.034907,-0.080757,0.028887,-0.020718,0.023524,0.052893,-0.089089,-0.035043,0.025829,-0.014004,-0.073912,0.0844,0.064782,0.05024,0.065906,0.036663,0.066429,0.042005,-0.006813,0.041719,0.054051,-0.046305,0.054649,3.1136e-215,6.0539e-65,3.7055e-182,1.4532e-108,1.0,4
"""great app""",5,2,-0.014104,0.017174,-0.002966,-0.046516,-0.017019,0.008789,0.051407,0.06125,0.013383,0.006914,0.026146,0.062958,-0.012255,-0.020339,-0.01556,-0.014934,0.048807,-0.007361,-0.034061,-0.092198,-0.071585,-0.008905,0.081074,0.01151,-0.034854,0.006535,-0.089001,0.003334,0.097457,-0.007563,-0.006297,-0.0075,0.068012,0.00893,…,-0.036246,0.006249,0.114489,-0.038381,0.024407,0.081155,-0.026367,-0.00803,-0.015107,0.013696,-0.099559,-0.018611,-0.013202,0.107969,-0.069055,0.058387,0.045398,0.005866,0.024816,0.038465,-0.037332,0.076489,0.03995,0.067182,-0.10647,0.037316,0.084856,0.0928,0.031532,-0.026777,0.067294,0.0,4.5948e-322,8.6948e-315,1.0,7.7620e-153,3
"""very helpful and users friendl…",5,6,-0.060244,-0.000202,0.000366,-0.02906,-0.028889,0.046181,0.076848,0.10218,-0.060577,-0.03258,0.001629,0.040058,0.026114,-0.018395,0.007373,0.013536,0.112137,-0.018703,-0.037492,-0.088012,-0.046279,0.011306,0.067366,0.03854,-0.014135,-0.050983,-0.054365,0.030509,0.085719,0.018947,-0.017,0.009404,0.021001,0.023793,…,-0.015327,0.01543,0.092056,0.034376,-0.021185,-0.008955,-0.065771,0.048189,-0.025592,0.031272,-0.006053,0.01102,-0.020719,0.122256,-0.136105,0.04829,0.02162,-0.019329,-0.033439,0.059711,-0.069456,0.045516,0.082173,-0.003607,-0.040826,0.057349,0.051784,0.05523,0.077177,0.011757,0.060161,3.6971e-167,3.9866e-82,5.5010e-142,1.0,1.6079e-94,3


In [49]:
def sample_sentences_from_clusters(df: pl.DataFrame, cluster_column: str, n_samples: int = 10, random_state: int = None) -> pl.DataFrame:
    """
    Randomly samples N rows from each cluster in a Polars DataFrame (Eager Mode).

    Args:
        df: The input Polars DataFrame.
        cluster_column: The name of the column with cluster assignments.
        n_samples: The number of samples to take per cluster.
        random_state: Optional random seed.

    Returns:
        A new Polars DataFrame with the sampled rows.  Returns an empty DataFrame on error.
    """
    if cluster_column not in df.columns:
        print(f"Error: Cluster column '{cluster_column}' not found.")
        return pl.DataFrame()

    if df[cluster_column].dtype not in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64]:
        df = df.with_columns(pl.col(cluster_column).cast(pl.Int32))

    # Use a list comprehension for clarity and efficiency
    sampled_dfs = [
        df.filter(pl.col(cluster_column) == cluster)
        .sample(n=n_samples, with_replacement=False, seed=random_state, shuffle=True)
        for cluster in df.get_column(cluster_column).unique()
    ]
    # Concatenate the sampled DataFrames
    return pl.concat(sampled_dfs)

performance = ["speed", "responsiveness", "working"]
price = ["subscription", "cost", "pricing", "premium"]
ui = ["design", "looks", "good-looking"]

compare each word to the average vector (centroid) of each of the features above. 

And then use rake to give us the label. 

(ie) "i really like it the reason im giving a 4 star is that i wish the focus promo could be set for more than 120 mins 120 mins means ive got to keep changing it againg and againg for longer focus periods which breaks the focus ps im using the free version i dont know whether it available on the premium version or not."

|—red— (price)| — green (performance)|

(ie2)
|—red— (performance)| — green (ui)|

[traditional approach]
label each cluster like "performance", "price" by ourselves.

[deep learning approach]
prompt gpt to assign a sentiment score and the cluster label for each sub-sentence.

BERT for sentiment analysis 
GPT for topic assigning

GPT output: {subsentence1: {'performance', 1'}, subsentence2: {"ui, -1"}}, GPT input: each sub-sentence, linear regression for each scoring, 

Using GPT labels, train a multiclassification model to assign each key word for heatmap generation. 

for each entire review string:
    green, label = "performance" for positive review (topic is about "performance")
    red, label = "price" for negative review (topic is about "price")

In [None]:
sampled_final = sample_sentences_from_clusters(final, "cluster")
sampled_final

processed_text,score,word_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,embedding_32,embedding_33,…,embedding_353,embedding_354,embedding_355,embedding_356,embedding_357,embedding_358,embedding_359,embedding_360,embedding_361,embedding_362,embedding_363,embedding_364,embedding_365,embedding_366,embedding_367,embedding_368,embedding_369,embedding_370,embedding_371,embedding_372,embedding_373,embedding_374,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383,prob_cluster_0,prob_cluster_1,prob_cluster_2,prob_cluster_3,prob_cluster_4,cluster
str,i64,u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,i32
"""it is better if dont have buy …",2,16,-0.068312,-0.04847,0.01716,-0.071224,0.086245,0.045599,-0.025234,0.108211,0.012332,0.0895,0.044412,0.057498,-0.023147,-0.035722,0.101885,0.007979,0.087252,-0.046616,-0.027425,-0.029841,-0.071511,-0.063712,0.037078,0.063393,0.062284,-0.068793,-0.07183,0.096331,0.015943,0.050485,0.002747,0.001428,0.007465,-0.008187,…,-0.047492,0.099531,-0.011397,0.040135,0.061109,0.001165,0.029691,0.046625,-0.017938,0.025185,-0.046796,-0.014344,-0.020852,-0.015262,-0.119878,0.043822,0.044284,-0.041539,-0.025636,-0.000341,-0.042981,0.070467,0.052382,-0.005408,-0.081609,0.044916,0.090535,0.103029,-0.014137,-0.018891,0.016686,1.0,3.0507e-116,2.1060e-69,5.9283e-42,1.8751e-150,0
"""i have enough credit but i can…",3,10,-0.043432,-0.011499,-0.022535,-0.017385,0.04132,0.038544,0.035625,-0.01246,0.01315,0.032255,0.061753,-0.065906,-0.026937,-0.035274,0.059712,0.014468,-0.026091,0.056838,0.01743,0.021808,-0.087784,-0.017142,-0.055838,-0.012934,0.031341,-0.030408,-0.037021,0.064189,0.035784,-0.00822,0.073224,-0.025629,0.120042,-0.03353,…,-0.134335,-0.012264,-0.065384,0.024484,-0.014076,-0.045014,-0.044299,0.000073,-0.04355,0.041261,-0.06512,0.032948,0.044733,-0.014842,-0.101771,0.083444,0.016022,-0.006314,0.014784,0.056112,-0.009225,-0.055344,0.011813,-0.015423,-0.000973,-0.035995,0.029709,0.057303,-0.062822,0.006566,-0.034964,1.0,9.7463e-195,4.3012e-169,8.5746e-213,1.2717e-192,0
"""i really like it the reason im…",4,64,-0.047447,-0.063284,-0.001835,-0.018367,0.062304,0.065805,0.06339,-0.025179,0.008796,0.03935,0.03057,0.03813,-0.054415,-0.018087,0.056847,0.009588,0.015157,-0.027301,0.00336,-0.020631,0.000361,-0.143796,0.016452,0.001438,0.054362,-0.065157,-0.063854,-0.012933,0.052495,-0.079652,0.004039,0.034539,0.153947,-0.008762,…,-0.073674,-0.129118,-0.040982,0.043245,0.048011,-0.021877,0.152486,0.007443,-0.062631,0.040536,0.002028,-0.084725,0.034349,0.005548,-0.155885,0.061873,-0.022606,-0.006373,0.01982,0.022352,0.002413,0.057153,0.104992,0.068058,0.022849,0.000507,0.024402,0.051753,-0.004833,-0.090341,0.0964,1.0,1.0417e-84,1.3651e-82,4.2565e-36,1.0982e-97,0
"""the free version was great unt…",2,42,-0.110508,-0.000475,0.092905,0.00792,0.077341,0.025413,-0.025552,0.033838,0.040654,-0.011584,-0.101279,0.014481,-0.080003,-0.032552,0.054968,-0.091267,0.061778,-0.062928,0.027622,-0.016938,-0.074992,-0.070837,0.052656,0.066568,0.032659,-0.061026,-0.030929,-0.04099,0.058769,0.046164,-0.010627,0.032049,0.028161,-0.052484,…,-0.023224,-0.007504,-0.059938,-0.080296,0.010758,0.035478,0.025623,0.09289,0.052863,0.049977,0.003955,-0.04325,0.030232,0.015086,-0.151982,0.071641,0.060661,-0.020374,-0.017736,0.043782,-0.050451,0.071367,0.068063,0.045725,-0.044125,0.065362,0.092345,0.031206,0.009497,-0.040708,0.025748,1.0,5.6748e-43,8.4348e-128,2.8773e-93,1.1910e-165,0
"""if this was free yeah id total…",1,21,-0.160618,0.010912,-0.005403,0.000473,0.127687,0.031816,0.047599,0.00204,0.054274,0.04176,0.023096,-0.066609,-0.01037,-0.008914,-0.069443,-0.016706,-0.013665,0.026306,-0.01463,0.0095,-0.081131,-0.03397,0.03526,0.035036,0.036637,0.004474,0.030843,-0.004837,0.016496,0.045912,0.051505,0.028742,0.026874,0.001311,…,0.028438,-0.022783,-0.05598,-0.093876,-0.019785,-0.032509,0.037334,0.044886,0.067765,0.018223,0.010537,-0.001562,-0.017741,0.007708,-0.154289,0.003994,-0.023266,-0.034959,-0.049448,0.088443,0.051241,0.071124,0.041917,0.074945,0.05516,-0.006575,0.071303,0.063712,-0.015489,-0.06992,-0.034734,1.0,2.9480e-134,3.8341e-213,5.6849e-97,1.4938e-70,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""poor interface""",1,2,0.032339,0.00269,0.038575,0.03416,-0.046246,-0.019045,0.012937,0.024144,-0.066709,-0.033189,0.03407,-0.058092,-0.043823,-0.012076,-0.053645,-0.01819,0.021473,-0.107426,-0.025302,0.020381,-0.050419,-0.000025,0.000751,-0.026488,-0.025885,0.018143,0.058047,-0.008323,0.022868,-0.071794,-0.018736,0.061433,-0.073136,-0.015602,…,0.033891,-0.048296,0.069198,-0.015451,0.052694,-0.029919,-0.073949,-0.036607,-0.020313,-0.054502,-0.065892,0.051597,-0.005164,0.027407,0.01549,0.097725,-0.024606,-0.07861,-0.007642,-0.011113,-0.039965,0.013434,-0.042239,0.004931,0.008862,-0.022584,0.023017,0.003395,0.000048,-0.035715,0.017753,0.0,1.1281e-157,2.2572e-76,1.2093e-102,1.0,4
"""itssa ok""",3,2,-0.068477,0.021948,-0.096182,-0.057766,-0.036918,0.030586,0.076428,0.027376,-0.041019,0.116234,0.03338,-0.048925,-0.044482,0.019871,-0.059469,-0.105481,0.025141,-0.102133,-0.07903,-0.031763,-0.07774,0.023131,-0.041322,-0.029511,-0.043499,-0.020625,-0.015094,-0.029349,-0.008884,-0.085997,-0.003263,-0.005974,0.096526,0.041458,…,-0.042556,-0.008349,0.030108,-0.013785,-0.013007,0.054274,-0.044944,0.048149,0.046362,0.056873,0.039517,-0.093006,0.108672,0.088552,0.002164,-0.028027,-0.012577,-0.054444,-0.035363,0.023401,-0.019773,0.061459,-0.049265,0.029242,0.056141,0.011883,-0.028974,0.12352,-0.02435,-0.028765,0.090619,0.0,0.0,6.5073e-260,2.5390e-293,1.0,4
"""i didnt get it but i dont know…",3,15,-0.019943,0.02646,0.00393,0.029054,0.052046,0.035669,0.066246,0.001497,-0.029103,0.006127,0.05432,-0.071922,0.002611,-0.074615,-0.048333,-0.033484,0.009227,-0.060971,-0.024037,-0.060819,-0.047658,0.035712,0.086542,-0.006811,-0.022914,-0.066028,-0.017509,0.031285,-0.007636,0.068008,0.110217,0.118438,0.139527,0.036791,…,-0.036034,0.021602,0.098406,-0.063771,-0.049284,0.034586,-0.022601,0.012294,-0.027836,0.019641,0.043324,-0.017732,0.080191,-0.03307,-0.024571,-0.014402,0.024353,0.011466,-0.072637,0.003772,0.004592,-0.00988,-0.079698,0.01896,0.05105,0.038713,-0.030907,0.065551,0.08595,-0.019968,0.001658,0.0,3.8255e-242,0.0,7.2611e-299,1.0,4
"""chill and easy good for keepin…",5,10,-0.042511,-0.01357,0.001038,0.09579,0.026239,0.05366,0.129193,0.0207,-0.027347,-0.038267,0.042448,-0.034459,0.048185,-0.001162,0.041258,-0.074299,0.006422,0.043681,-0.026029,-0.053233,-0.084499,0.027456,-0.033183,0.054177,-0.077336,0.108193,-0.020104,-0.0466,-0.005262,0.018101,-0.030813,0.069539,0.03592,-0.012219,…,-0.025496,-0.010762,-0.068894,0.052435,-0.034082,0.001867,-0.003476,-0.05764,0.010699,-0.063137,0.104183,-0.114451,-0.083828,0.060555,-0.032176,-0.011469,0.029905,0.018959,-0.023085,0.063969,0.098041,-0.057442,0.04357,0.033993,0.027363,-0.04542,0.021368,0.028961,0.025589,-0.131762,0.013032,3.7382e-307,2.0307e-117,6.3958e-165,2.3606e-139,1.0,4


In [54]:
def display_full_text(df: pl.DataFrame, text_column: str):
    """
    Displays the full text from a specified column, without truncation.
    (Simplest version, no row numbers if one-liner isn't feasible).
    """
    if text_column not in df.columns:
        print(f"Error: Text column '{text_column}' not found.")
        return

    for text in df.get_column(text_column):
        print(text)

i really like it the reason im giving a 4 star is that i wish the focus promo could be set for more than 120 mins 120 mins means ive got to keep changing it againg and againg for longer focus periods which breaks the focus ps im using the free version i dont know whether it available on the premium version or not.

make a table for each cluster. 
part 1: cluster 0 (ask ChatGPT what this is)
part 2: cluster 3 
part 3: cluster 1

Deep learning: (pass entire subsentence to chatgpt, which can do the sentiment and the classification by itself) 

ML: we do clustering for categorizing (need human labeling) and linear reg for score prediction.


In [57]:
display_full_text(sampled_final, "processed_text") # .select("processed_text")

it is better if dont have buy pro version hoping an app free offline soon 
i have enough credit but i cant purchase for premium
i really like it the reason im giving a 4 star is that i wish the focus promo could be set for more than 120 mins 120 mins means ive got to keep changing it againg and againg for longer focus periods which breaks the focus ps im using the free version i dont know whether it available on the premium version or not
the free version was great until now but since they added advertisements to it e g forced to watch an ad after creating an appointment i cannot recommend it anymore it is absolutely annoying the google calendar is the better alternative now
if this was free yeah id totally use this who honestly wants to pay to keep track of your life wth
i used to love this app and used the whitelist feature all the time so i could study pdfs on my phone i redownloaded it recently and discovered they now put the whitelist feature behind a paywall which basically defea