In [37]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from dateutil.relativedelta import relativedelta
from datetime import datetime

# Certifique-se de ter os recursos do NLTK baixados
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lolop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lolop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lolop\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
def clean_text(text, stop_words, stemmer=None, lemmatizer=None):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    if stemmer:
        words = [stemmer.stem(word) for word in words]
    elif lemmatizer:
        words = [lemmatizer.lemmatize(word) for word in words]
    clean_text = ' '.join(words)
    return clean_text


In [41]:
def clean_csv(input_file, output_file, column_to_clean, use_stemming=True):
    df = pd.read_csv(input_file)
    stop_words = set(stopwords.words('portuguese'))
    stemmer = PorterStemmer() if use_stemming else None
    lemmatizer = WordNetLemmatizer() if not use_stemming else None
    df[column_to_clean] = df[column_to_clean].apply(lambda x: clean_text(str(x), stop_words, stemmer, lemmatizer))
    df.to_csv(output_file, index=False)


In [43]:
def add_fraud_column(input_file, output_file):
    df = pd.read_csv(input_file)
    # Adiciona uma coluna 'fraude' com base nas estrelas (supondo que 1 e 5 estrelas são suspeitas de fraude)
    df['fraude'] = df['estrelas'].apply(lambda x: 1 if x in [1, 5] else 0)
    df.to_csv(output_file, index=False)


In [45]:
def convert_relative_time_to_days(time_str):
    match = re.match(r'(\d+)\s*(dias|semanas|meses|anos)\s*atrás', time_str.lower())
    
    if match:
        num = int(match.group(1))
        unit = match.group(2)
        
        current_date = datetime.now()
        
        if unit == 'dias':
            past_date = current_date - relativedelta(days=num)
        elif unit == 'semanas':
            past_date = current_date - relativedelta(weeks=num)
        elif unit == 'meses':
            past_date = current_date - relativedelta(months=num)
        elif unit == 'anos':
            past_date = current_date - relativedelta(years=num)
        else:
            return 0
        
        return (current_date - past_date).days
    
    return 0


In [47]:
def create_new_features(df):
    df['tempo'] = df['tempo'].apply(convert_relative_time_to_days)
    
    df['avaliacoes_classificacoes_ratio'] = df['avaliacoes'] / (df['classificacoes'] + 1)
    df['fotos_por_avaliacao'] = df['fotos'] / (df['avaliacoes'] + 1)
    df['intervalo_avaliacoes'] = df['tempo'].diff().fillna(0)  # Diferença de dias entre avaliações
    df['interacoes_ativas'] = df['p/r'] / (df['avaliacoes'] + 1)
    df['lugares_por_avaliacao'] = df['lugares adicionados'] / (df['avaliacoes'] + 1)
    df['local_guide_peso'] = df['local guide'] * df['avaliacoes']
    
    return df


In [49]:
def tokenize_and_vectorize(input_file, output_file, text_column, use_tfidf=True, max_features=1000):
    df = pd.read_csv(input_file)
    df = create_new_features(df)
    if use_tfidf:
        vectorizer = TfidfVectorizer(max_features=max_features)
    else:
        vectorizer = CountVectorizer(max_features=max_features)
    vectors = vectorizer.fit_transform(df[text_column].astype(str))
    vectors_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
    df = pd.concat([df, vectors_df], axis=1)
    df.to_csv(output_file, index=False)


In [51]:
def prepare_data_for_model(input_file, text_column, label_column, max_features=1000):
    df = pd.read_csv(input_file)
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_text = vectorizer.fit_transform(df[text_column].astype(str))
    X = pd.DataFrame(X_text.toarray())
    y = df[label_column]
    return X, y


In [62]:
def balance_data_with_smote(X, y):
    smote = SMOTE(random_state=42, k_neighbors=1)  # Ajustando n_neighbors para 1
    X_res, y_res = smote.fit_resample(X, y)
    return X_res, y_res


In [55]:
def train_and_evaluate_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    model_rf.fit(X_train, y_train)
    y_pred = model_rf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return model_rf


In [70]:
input_file = 'entrada3.csv'  # Substitua pelo caminho do arquivo de entrada
output_file_clean = 'saida.csv'  # Substitua pelo caminho do arquivo limpo
output_file_fraud = 'token.csv'  # Arquivo com a coluna de fraude e tokenização
output_file_token = 'token_completo.csv'  # Arquivo final com todas as features e tokenização

# 1. Limpar o texto
clean_csv(input_file, output_file_clean, column_to_clean='avaliacao', use_stemming=True)

# 2. Adicionar a coluna de fraude
add_fraud_column(output_file_clean, output_file_fraud)

# 3. Tokenizar e vetorializar (mantendo todas as colunas do arquivo original + tokenização)
tokenize_and_vectorize(output_file_fraud, output_file_token, text_column='avaliacao', use_tfidf=True, max_features=1000)

# 4. Preparar os dados para o modelo
X, y = prepare_data_for_model(output_file_token, text_column='avaliacao', label_column='fraude')

# 5. Balancear os dados com SMOTE
X_res, y_res = balance_data_with_smote(X, y)

# 6. Treinar e avaliar o modelo
modelo = train_and_evaluate_model(X_res, y_res)


Accuracy: 0.7419354838709677
              precision    recall  f1-score   support

           0       0.77      0.67      0.71        15
           1       0.72      0.81      0.76        16

    accuracy                           0.74        31
   macro avg       0.75      0.74      0.74        31
weighted avg       0.74      0.74      0.74        31

