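# Play Store review polarity: text cleaning and vectorization

Loads the raw Play Store reviews, cleans the review text, splits it into train and test sets, and saves bag-of-words and TF-IDF representations together with the fitted vectorizers.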

In [11]:
import pandas as pd
import re
import unidecode
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

### --- Carga y Exploración Inicial de Datos ---

In [12]:
# Load the raw Play Store reviews and print a quick structural summary.
data = pd.read_csv('../data/raw/playstore_reviews.csv')
print("Shape of the dataset:", data.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own line; wrapping it in print() emitted "Data info: None".
print("Data info:")
data.info()
print("Value counts of polarity:", data['polarity'].value_counts())
print("Number of duplicated rows:", data.duplicated().sum())

Shape of the dataset: (891, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB
Data info: None
Value counts of polarity: polarity
0    584
1    307
Name: count, dtype: int64
Number of duplicated rows: 0


### --- Limpieza de Datos ---

In [13]:
# Work on a copy without the package name column; `data` itself is left untouched.
data_cleaned = data.drop(columns=['package_name'])

def clean_text(text):
    """Normalize a raw review into lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Transliterate the text with ``unidecode``.
      2. Replace dates written as ``d/m/yyyy`` (1-2 digits / 1-2 digits / 4 digits)
         with the placeholder token ``fecha``.
      3. Delete every character that is neither an ASCII letter nor whitespace.
      4. Collapse whitespace runs to one space, trim the ends and lowercase.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = unidecode.unidecode(text)
    # The date placeholder must be inserted BEFORE digits and slashes are
    # stripped; doing it afterwards (as before) meant the pattern never matched.
    text = re.sub(r'\d{1,2}/\d{1,2}/\d{4}', 'fecha', text)
    # Keeping only [a-zA-Z\s] also removes any remaining non-ASCII symbols, so
    # no separate non-ASCII filtering pass is needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Swap the raw review column for its cleaned version, then display the frame.
data_cleaned = data_cleaned.assign(review=data_cleaned['review'].apply(clean_text))
data_cleaned

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,messenger issues ever since the last update in...,0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who dont...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well ads arent as annoying...,1


### --- División de Datos en Train y Test ---

In [14]:
# Features are the cleaned review strings; the target is the polarity column.
X, y = data_cleaned['review'], data_cleaned['polarity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Shape of training set:", X_train.shape)

Shape of training set: (712,)


### --- Vectorización con CountVectorizer ---

In [17]:
# Define the output locations before they are used.
PROCESSED_DATA_PATH = '../data/processed/'
VECTORIZER_PATH = '../data/processed/'

# Fit the vocabulary on the training reviews only, then reuse it for the test
# reviews. The sparse results are converted to dense arrays.
vec_model_count = CountVectorizer(stop_words='english')
X_train_count = vec_model_count.fit_transform(X_train).toarray()
X_test_count = vec_model_count.transform(X_test).toarray()

# Write the count matrices and the targets to Excel, without the index column.
for frame, target in (
    (pd.DataFrame(X_train_count), f"{PROCESSED_DATA_PATH}X_train.xlsx"),
    (pd.DataFrame(X_test_count), f"{PROCESSED_DATA_PATH}X_test.xlsx"),
    (y_train, f"{PROCESSED_DATA_PATH}y_train.xlsx"),
    (y_test, f"{PROCESSED_DATA_PATH}y_test.xlsx"),
):
    frame.to_excel(target, index=False)

# Pickle the fitted vectorizer to disk.
with open(f"{VECTORIZER_PATH}CountVectorizer.pkl", "wb") as pkl_out:
    pickle.dump(vec_model_count, pkl_out)

print("CountVectorizer model saved.")


CountVectorizer model saved.


### --- Vectorización con TfidfVectorizer ---

In [18]:
# Fit the TF-IDF vectorizer on the training reviews only, then transform the
# test reviews with the fitted model. Results are converted to dense arrays.
vec_model_tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = vec_model_tfidf.fit_transform(X_train).toarray()
X_test_tfidf = vec_model_tfidf.transform(X_test).toarray()

# Write both TF-IDF matrices to Excel, without the index column.
# PROCESSED_DATA_PATH and VECTORIZER_PATH come from an earlier cell.
for matrix, target in (
    (X_train_tfidf, f"{PROCESSED_DATA_PATH}X_train_tfidf.xlsx"),
    (X_test_tfidf, f"{PROCESSED_DATA_PATH}X_test_tfidf.xlsx"),
):
    pd.DataFrame(matrix).to_excel(target, index=False)

# Pickle the fitted vectorizer to disk.
with open(f"{VECTORIZER_PATH}TfidfVectorizer.pkl", "wb") as pkl_out:
    pickle.dump(vec_model_tfidf, pkl_out)

print("TfidfVectorizer model saved.")

TfidfVectorizer model saved.
