Disester Prediction Tweeter Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [3]:
def clean_text(text):
    replacements = {
        '\x89ÛÏ': '"',
        '\x89Û\x9d': '"',
        '\x89Ûª': "'",
        '\x89ÛÒ': '-',
        '\x89Û_': '',
        '\x89ÛÓ': '',
        '\x89Û¢': '',
        '\x89Ûª': '',
        '\x89Û÷': '',
        '\x89âÂ': '',

        '&gt;': '>',
        '&lt;': '<',
        '&amp;': '&',

        '\n': ' ',
    }

    for original, replacement in replacements.items():
        text = text.replace(original, replacement)

    return text

def replace_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'[URL]', text)

def jaccard_similarity(s1, s2):
    set1 = set(s1.split())
    set2 = set(s2.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)


In [4]:
def remove_duplicates(data):
  similarity_threshold = 0.75
  duplicates = set()
  
  # It will run for 5 minutes or so
  for i in range(len(data)):
      if i in duplicates:
        continue
  
      for j in range(i + 1, len(data)):
        if j in duplicates:
          continue
        similarity = jaccard_similarity(data.loc[i, 'text'], data.loc[j, 'text'])
        if similarity > similarity_threshold:
          duplicates.add(j)
  
  return data.drop(duplicates).reset_index(drop=True)

In [5]:
def fix_keyword_inplace(data):
  data['keyword'] = data['keyword'].apply(lambda x: x.replace('%20', ' ') if pd.notna(x) and isinstance(x, str) else x)

def extract_url_feature_inplace(data):
  data['text'] = data['text'].apply(clean_text).apply(replace_urls)
  data["has_url"] = data['text'].apply(lambda text: '[URL]' in text)
  data['text'] = data['text'].apply(lambda x: x.replace('[URL]', ''))
  for i in range(10):
    data['text'] = data['text'].apply(lambda x: x.replace('  ', ' '))

def aggregate_location_inplace(data):
  mapping_dict = {
      'new york, ny': 'new york',
      'united states': 'usa',
      'nyc': 'new york',
      'london, uk': 'london',
      'london, england': 'london',
      'us': 'usa',
      'ny': 'new york',
      'earth': 'planet earth',
      'california, usa': 'california',
      'los angeles, ca': 'los angeles',
      'washington, dc': 'washington dc',
      'world': 'planet earth',
      'united kingdom': 'uk',
      'global': 'planet earth',
      'new york city': 'new york',
      'new york, usa': 'new york',
      'worldwide': 'planet earth',
      'hackney, london': 'london',
      'england': 'uk',
  }

  data['location'] = data['location'].str.lower().replace(mapping_dict)
  
  location_counts = data['location'].value_counts()
  singleton_values = location_counts[location_counts == 1].index.tolist()
  
  data['location'].replace(singleton_values, '[something]', inplace=True)

In [6]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

for data in [train_df, test_df]:
  extract_url_feature_inplace(data)
  fix_keyword_inplace(data)

train_df = remove_duplicates(train_df)

for data in [train_df, test_df]:
  aggregate_location_inplace(data)

KeyboardInterrupt: 

In [8]:
train_df.shape

(6704, 6)

In [9]:
train_df = train_df.rename(columns={'target': 'labels'})
test_df = test_df.rename(columns={'target': 'labels'})

In [10]:
train_df.to_csv('train_cleaned.csv', index=False)
test_df.to_csv('test_cleaned.csv', index=False)

In [7]:
train = pd.read_csv("train_cleaned.csv")
test = pd.read_csv("test_cleaned.csv")

In [32]:
train["text"] = train["text"].astype(str)
train.info()
train.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6704 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6704 non-null   int64 
 1   keyword   6654 non-null   object
 2   location  4509 non-null   object
 3   text      6704 non-null   object
 4   labels    6704 non-null   int64 
 5   has_url   6704 non-null   bool  
dtypes: bool(1), int64(2), object(3)
memory usage: 268.5+ KB


(6704, 6)

In [8]:
X=list(train['text'])
y = list(train["labels"])

In [33]:
X

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Forest fire near La Ronge Sask. Canada',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
 '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
 '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas',
 "I'm on top of the hill and I can see a fire in the woods...",
 "There's an emergency evacuation happening now in the building across the street",
 "I'm afraid that the tornado is coming to our area...",
 'Three people died from the heat wave so far',
 'Haha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA 

In [9]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,


In [10]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [19]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [12]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [13]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [17]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# training_args = TFTrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=2,              # total number of training epochs
#     per_device_train_batch_size=8,  # batch size per device during training
#     per_device_eval_batch_size=16,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     logging_steps=10,
# )

In [42]:
# with training_args.strategy.scope():
#     model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# trainer = TFTrainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_dataset,         # training dataset
#     eval_dataset=test_dataset             # evaluation dataset
# )

# trainer.train()

In [20]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

In [45]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=3,
          batch_size=16,
          validation_data=test_dataset.shuffle(100).batch(16))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x193f7fdd8d0>

In [47]:
# model.evaluate(test_dataset)

In [59]:
model.save_pretrained(r"C:\Users\macy1\OneDrive\Documents\MachineLearning\Projects\Classification\DisesterPrediction\Model")

In [60]:
model.save_pretrained("/tmp/disesterPrediction_custom_model")

In [53]:
# model.predict(test_dataset)

In [52]:
# model.predict(test_dataset)[0].shape

In [None]:
# output=model.predict(test_dataset)[1]

In [2]:
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("/tmp/disesterPrediction_custom_model")

NameError: name 'TFDistilBertForSequenceClassification' is not defined

In [112]:
test_sentence = "life is good"

predict_input = tokenizer.encode(test_sentence,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")
tf_output = loaded_model.predict(predict_input)[0]



In [114]:
tf_prediction = tf.nn.sigmoid(tf_output).numpy()[0]

In [115]:
print(tf_prediction[1])

0.099682085


### Creating submission file

In [141]:
# Creting prediction.csv file
submission = pd.read_csv("test_cleaned.csv")
X_sub=list(submission['text'])
submission.shape

(3263, 5)

In [99]:
sub_encodings = tokenizer(X_sub, truncation=True, padding=True)

In [100]:
sub_dataset = tf.data.Dataset.from_tensor_slices((
    dict(sub_encodings)
))

In [122]:
sub_output = model.predict(sub_dataset)[0]



In [121]:
sub_prediction = tf.nn.sigmoid(tf_output).numpy()[0][1]
# if sub_prediction > 0.5:
#     print(1)
# else:
#     print(0)
sub_prediction

0.099682085

In [137]:
tf_prediction = tf.nn.sigmoid(sub_output).numpy()[:,1]

In [139]:
print(tf_prediction)
tf_prediction.shape

[0.7340498  0.9496094  0.8865386  ... 0.9532825  0.5582137  0.86128664]


(3263,)

In [143]:
target = []

for i in tf_prediction:
    if i > 0.5:
        target.append(1)
    else:
        target.append(0)

target

[1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,


In [148]:

id = submission["id"]
submission.head()

Unnamed: 0,id,keyword,location,text,has_url
0,0,,,Just happened a terrible car crash,False
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",False
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",False
3,9,,,Apocalypse lighting. #Spokane #wildfires,False
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,False


In [151]:
df2 = pd.DataFrame({"id":id,
                    "target": target})

df2

'id,target\r\n0,1\r\n2,1\r\n3,1\r\n9,1\r\n11,1\r\n12,1\r\n21,0\r\n22,0\r\n27,0\r\n29,0\r\n30,0\r\n35,0\r\n42,0\r\n43,0\r\n45,0\r\n46,1\r\n47,0\r\n51,1\r\n58,0\r\n60,0\r\n69,1\r\n70,1\r\n72,0\r\n75,1\r\n84,0\r\n87,0\r\n88,0\r\n90,0\r\n94,0\r\n99,1\r\n101,0\r\n103,0\r\n106,0\r\n108,0\r\n111,1\r\n115,0\r\n116,0\r\n122,0\r\n123,0\r\n124,1\r\n125,0\r\n127,1\r\n140,0\r\n142,1\r\n147,0\r\n148,0\r\n150,0\r\n152,0\r\n154,1\r\n155,0\r\n166,0\r\n167,0\r\n169,1\r\n177,0\r\n179,0\r\n181,0\r\n186,0\r\n188,0\r\n189,0\r\n192,0\r\n200,1\r\n202,1\r\n206,1\r\n207,1\r\n214,1\r\n217,1\r\n223,0\r\n224,1\r\n227,1\r\n228,1\r\n230,1\r\n233,1\r\n234,1\r\n236,1\r\n239,1\r\n250,1\r\n255,0\r\n257,0\r\n259,0\r\n275,1\r\n278,0\r\n282,0\r\n284,0\r\n286,0\r\n288,1\r\n292,1\r\n295,0\r\n300,1\r\n304,1\r\n305,1\r\n306,0\r\n308,0\r\n311,0\r\n317,0\r\n319,0\r\n323,0\r\n324,0\r\n325,1\r\n326,1\r\n333,0\r\n339,0\r\n342,0\r\n343,0\r\n350,0\r\n351,1\r\n357,0\r\n359,0\r\n362,0\r\n366,0\r\n367,0\r\n369,0\r\n373,0\r\n374,1\r\n376

In [152]:
compression_opts = dict(method='zip',
                        archive_name='out.csv')  
df2.to_csv('out.zip', index=False,
          compression=compression_opts)