In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score as accuracy

from pathlib import Path

## Read the data

In [2]:
def load_data(set):
    return pd.read_json(set, lines=True)

def read_data(set):
    return pd.read_csv(set)

## Create X (inputs) and y (labels)

In [3]:
def create_x_y(df, set):
    if set == 'train':
        x = df['text']
        y = df['label']
        return x, y
    else:
        return df['text']

## Vectorize the text

In [4]:
def vectorize_text(x_train, x_test, vectorizer):
    x_train_vectorized = vectorizer.fit_transform(x_train)
    x_test_vectorized = vectorizer.transform(x_test)
    return x_train_vectorized, x_test_vectorized

## Classify the text

In [5]:
def classify(x_train_vectorized, y_train, x_test_vectorized):
    model = GradientBoostingClassifier()
    model.fit(x_train_vectorized, y_train)
    y_pred = model.predict(x_test_vectorized)
    return y_pred

## Create submission file

In [12]:
def create_submission_file(df_test, predictions):
    df = pd.DataFrame(columns=['id', 'label'])
    df['id'] = df_test['id']
    df['label'] = predictions
    df.to_csv('submission.csv', index=False)

## Print results

In [16]:
def print_classification_report(y_true, y_pred):
    """
    Função para imprimir o relatório de classificação.
    """
    report = classification_report(y_true, y_pred)
    print(report)

## Run

In [9]:
# List available files
input_dir = Path('C:/Users/jeanc/Desktop/Desafio - Rafael')
for p in input_dir.rglob('*.jsonl'):
    print(p)

C:\Users\jeanc\Desktop\Desafio - Rafael\train.jsonl
C:\Users\jeanc\Desktop\Desafio - Rafael\validation.jsonl


In [15]:
df_train = load_data(input_dir.joinpath('train.jsonl'))
x_train, y_train = create_x_y(df_train, 'train')
df_test = read_data(input_dir.joinpath('test.csv'))
x_test = create_x_y(df_test, 'test')
vectorizer = TfidfVectorizer()  # create a TfidfVectorizer object
x_train_vectorized, x_test_vectorized = vectorize_text(x_train, x_test, vectorizer)  # vectorize the text
predictions = classify(x_train_vectorized, y_train, x_test_vectorized)  # classify the text
create_submission_file(df_test, predictions)  # create the submission file

### Resultados

In [21]:
print("\nRelatório de Classificação (Conjunto de Teste):")
df_val = load_data(input_dir.joinpath('validation.jsonl'))
y_true = df_val['label']
for value in predictions:
    print(value)
print(accuracy(y_true, predictions))
# print_classification_report(y_true, predictions)




Relatório de Classificação (Conjunto de Teste):
1
0
0
1
0
1
0
0
0
0
0
0
1
1
0
0
1
0
1
1
1
1
1
0
1
0
0
0
1
0
0
0
1
1
0
1
1
1
0
0
0
1
1
1
1
0
1
1
1
1
1
1
1
1
0
0
0
1
1
0
0
1
0
1
1
0
0
0
1
0
1
0
1
0
0
0
1
1
0
1
1
0
0
0
0
0
0
1
1
0
1
0
1
0
0
1
1
0
1
0
0
1
1
1
1
0
1
1
1
1
0
1
1
1
0
0
1
1
0
0
0
0
0
1
1
0
0
0
0
1
0
1
0
1
0
1
1
0
0
0
0
0
1
0
0
0
1
0
1
1
0
1
0
1
1
1
0
0
1
1
0
0
0
1
1
1
1
0
0
1
1
0
1
0
0
0
1
1
0
0
1
0
0
0
0
1
0
0
0
1
0
0
0
1
1
1
0
0
1
0
0
0
1
1
1
1
0
1
0
1
1
1
0
1
0
0
0
0
1
1
0
0
0
0
1
0
0
1
0
0
1
0
1
1
0
0
1
1
0
0
1
1
0
1
0
0
1
1
0
1
0
0
0
1
1
0
0
1
1
1
0
0
1
1
0
1
1
1
0
0
0
0
1
1
0
1
1
1
0
0
1
0
0
1
1
1
0
1
0
1
1
0
0
1
0
1
1
0
0
1
1
1
0
1
0
1
1
1
1
1
0
0
0
1
0
1
0
1
1
0
1
0
1
0
1
1
1
0
1
1
1
1
1
0
0
0
0
1
0
0
0
1
1
0
1
0
0
1
0
0
1
0
0
1
1
1
1
0
1
1
1
0
0
0
0
0
1
0
1
1
0
1
0
0
1
1
0
0
1
0
1
1
0
1
0
1
1
0
1
0
1
0
0
0
0
0
0
1
1
1
0
0
0
0
1
0
0
0
1
0
0
1
1
1
0
0
1
0
0
1
1
0
1
1
1
0
0
0
1
0
1
0
0
1
1
1
0
1
1
1
1
1
0
0
0
1
1
1
1
1
0
1
0
0
1
0
0
1
1
0
0
0
1
1
1
1
0
0
0
0
0
0
0
1
0
1

ValueError: Found input variables with inconsistent numbers of samples: [570, 1140]