In [44]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

### Creación de los datasets de entrenamiento y testeo

In [45]:
df = pd.read_csv('../../data/clean/tesla_tweets_clean_random_labelled.csv')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Link       50 non-null     object
 1   Date       50 non-null     object
 2   Text       50 non-null     object
 3   Sentiment  50 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ KB
None


Unnamed: 0,Link,Date,Text,Sentiment
0,https://twitter.com/SallyFereday/status/159006...,11/8/2022 23:10,Today TSLA is selling at a week low Some may ...,0
1,https://twitter.com/MyBostonRealtor/status/151...,4/20/2022 6:18,teslamobile TmobileTeslaTuesdays contest just ...,1
2,https://twitter.com/Ev92Revolution/status/1544...,7/5/2022 14:17,thats funny You praise VW because its a germa...,1
3,https://twitter.com/LinnieL7/status/1532796385...,6/3/2022 23:18,Well see if the smartest man on the planet can...,0
4,https://twitter.com/TeslaradarB/status/1534895...,6/9/2022 18:19,Smooth EvaTheKfer nailed it by spotting a Tesl...,1


In [46]:
df.drop(['Link', 'Date'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       50 non-null     object
 1   Sentiment  50 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 928.0+ bytes


In [47]:
mask = df['Sentiment'] == 1

aux1 = df[mask].sample(frac=1, random_state=42).reset_index(drop=True)
aux2 = df[~mask].sample(frac=1, random_state=42).reset_index(drop=True)


train_df_true, test_df_true = train_test_split(aux1, test_size=0.2, random_state=42)
train_df_false, test_df_false = train_test_split(aux2, test_size=0.2, random_state=42)

train_df = pd.concat([train_df_true, train_df_false]).sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat([test_df_true, test_df_false]).sample(frac=1, random_state=42).reset_index(drop=True)

train_df.rename(columns={'Sentiment': 'label', 'Text': 'text'}, inplace=True)
test_df.rename(columns={'Sentiment': 'label', 'Text': 'text'}, inplace=True)

train_df.to_csv('../../data/clean/tesla_tweets_clean_random_labelled_train.csv', index=False)
test_df.to_csv('../../data/clean/tesla_tweets_clean_random_labelled_test.csv', index=False)


In [48]:
dataset = load_dataset(
    'csv',
    data_files={
        'train': '../../data/clean/tesla_tweets_clean_random_labelled_train.csv',
        'test': '../../data/clean/tesla_tweets_clean_random_labelled_test.csv'
    },
    cache_dir='../../data/cache'
)

dataset

Downloading and preparing dataset csv/default to c:/Users/34644/Desktop/Cursos/Curso CEI/Trabajo final/data/cache/csv/default-f1a0eb82e5152487/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...



Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 997.69it/s]

Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 83.19it/s]

[A
[A
[A
[A
[A

Dataset csv downloaded and prepared to c:/Users/34644/Desktop/Cursos/Curso CEI/Trabajo final/data/cache/csv/default-f1a0eb82e5152487/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.



[A
100%|██████████| 2/2 [00:00<00:00, 16.95it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 39
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 11
    })
})

In [49]:
model = SetFitModel.from_pretrained(
    "sentence-transformers/all-mpnet-base-v2"
)


Downloading (…)lve/main/config.json: 100%|██████████| 571/571 [00:00<00:00, 566kB/s]

Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 

Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 10.4MB/s]

Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<?, ?B/s] 

Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<?, ?B/s] 

Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 525kB/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [05:33<00:00, 1.31MB/s]

Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 2

In [50]:
trainer = SetFitTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=18, 
    num_epochs=1
)

In [51]:
trainer.train()


[A
[A
Generating Training Pairs: 100%|██████████| 18/18 [00:00<00:00, 74.07it/s]
***** Running training *****
  Num examples = 1404
  Num epochs = 1
  Total optimization steps = 88
  Total train batch size = 16



[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

Iteration: 100%|██████████| 88/88 [23:18<00:00, 15.89s/it]

Epoch: 100%|██████████| 1/1 [23:18<00:00, 1398.34s/it]


In [52]:
metrics = trainer.evaluate()
metrics

***** Running evaluation *****


{'accuracy': 0.9090909090909091}

In [53]:
trainer.model.save_pretrained(save_directory='./output_models/')

In [57]:
preds = model([
    'Tesla stock is so down today',
    'Tesla stock is so up today'
    'I love tesla so much',
    'I hate tesla so much',
    'i crashed my tesla today',
    'elon musk just saved a cat from a fire',
    'i feel like elon musk is not doing enough for the environment',
])
preds

tensor([0, 1, 0, 0, 1, 0], dtype=torch.int32)