In [31]:
import csv
import re
import string

import numpy as np
import pandas as pd

### 0. Read data


In [29]:
# Mount Google Drive so the dataset paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
# Locations of the two GossipCop CSV files on the mounted Drive.
# Real and fake news are stored in separate files; load_data labels them 1 and 0.
real_path = '/content/drive/MyDrive/Colab Notebooks/AI/gossipcop_real.csv'
fake_path = '/content/drive/MyDrive/Colab Notebooks/AI/gossipcop_fake.csv'

In [33]:
def load_data(real_path, fake_path):
    """Read the real and fake news CSVs and stack them into one labelled frame.

    Args:
        real_path: path of the CSV holding real news rows.
        fake_path: path of the CSV holding fake news rows.

    Returns:
        A DataFrame with the real rows first, then the fake rows, a fresh
        0..n-1 index, and a ``label`` column set to 1 for real and 0 for fake.
    """
    labelled_frames = [
        pd.read_csv(csv_path).assign(label=flag)
        for csv_path, flag in ((real_path, 1), (fake_path, 0))
    ]
    # ignore_index renumbers the rows so the two source indexes do not collide
    return pd.concat(labelled_frames, ignore_index=True)

In [34]:
# Combined dataset: label 1 = real, label 0 = fake (assigned inside load_data).
data = load_data(real_path, fake_path)

In [None]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,gossipcop-882573,https://www.brides.com/story/teen-mom-jenelle-...,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,912371411146149888\t912371528343408641\t912372...,1
1,gossipcop-875924,https://www.dailymail.co.uk/tvshowbiz/article-...,Kylie Jenner refusing to discuss Tyga on Life ...,901989917546426369\t901989992074969089\t901990...,1
2,gossipcop-894416,https://en.wikipedia.org/wiki/Quinn_Perkins,Quinn Perkins,931263637246881792\t931265332022579201\t931265...,1
3,gossipcop-857248,https://www.refinery29.com/en-us/2018/03/19192...,I Tried Kim Kardashian's Butt Workout & Am For...,868114761723936769\t868122567910936576\t868128...,1
4,gossipcop-884684,https://www.cnn.com/2017/10/04/entertainment/c...,Celine Dion donates concert proceeds to Vegas ...,915528047004209152\t915529285171122176\t915530...,1


### 1. Preprocess the text


In [35]:
def wordopt(text):
    """Normalise raw text before tokenisation.

    The text is lowercased, then square-bracketed spans, URLs and HTML-like
    tags are deleted (replaced by the empty string, so surrounding whitespace
    is left untouched).

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Convert the text to lowercase
    text = text.lower()

    # Patterns are raw strings so that \[, \S and \. reach the regex engine
    # unchanged instead of being parsed as (invalid) string escape sequences.

    # Remove text within square brackets (non-greedy: each [...] pair separately)
    text = re.sub(r'\[.*?\]', '', text)

    # Remove URLs (starting with http or https or www)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags (everything between < and >, plus any run of extra '>')
    text = re.sub(r'<.*?>+', '', text)

    return text

### 2. Analyse the features

In [30]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import subjectivity
from nltk import ne_chunk

# Fetch the NLTK data packages this notebook relies on. As the log below shows,
# a package that is already present is reported as up-to-date rather than re-downloaded.
# 'wordnet' backs the synonym lookup (wn.synsets) used to build the lexicon later on;
# NOTE(review): the remaining packages presumably back the tokenizers, tagger/chunker,
# stopword list and sentiment analyzer imported above - confirm which are still needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('subjectivity')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Unzipping corpora/subjectivity.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
from nltk.corpus import wordnet as wn
# Seed words for each hand-picked category. The loop that follows expands every
# seed word with the lemma names of all its WordNet synsets to form `lexicon`.
dic ={
    'feeling' : [
    "Happy",
    "Sad",
    "Angry",
    "Fearful",
    "Surprised",
    "Disgusted"
],
    'motion' : [
    "Run",
    "Walk",
    "Jump",
    "Move",
    "Stop",
    "Go",
    "Sprint",
    "Jog",
    "Dash",
    "Leap",
    "Hop",
    "Skip",
    "Crawl",
    "Slide",
    "Glide",
    "Swim",
    "Fly",
    "Climb",
    "Hike",
    "Stroll",
    "March",
    "Trot",
    "Gallop",
    "Race",
    "Saunter",
    "Wander",
    "Roam",
    "Trek",
    "Amble",
    "Shuffle",
    "Limp",
    "Slink",
    "Stride",
    "Tiptoe",
    "Vault",
    "Dive",
    "Creep",
    "Roll",
    "Maneuver",
    "Navigate"
],
    'assents' : [
    "Yes",
    "Agree",
    "Approve",
    "Consent",
    "Affirmative"
],
    'perception' : [
    "See",
    "Hear",
    "Touch",
    "Taste",
    "Smell"
]
}

# Expand each category's seed words into a set containing the lemma names
# of every WordNet synset that any of the seed words belongs to.
lexicon = {
  category: {
    lemma
    for seed in seeds
    for synset in wn.synsets(seed)
    for lemma in synset.lemma_names()
  }
  for category, seeds in dic.items()
}
print(lexicon)

{'feeling': {'glad', 'sicken', 'horrendous', 'tempestuous', 'sad', 'trepid', 'surprise', 'churn_up', 'angry', 'revolt', 'cowardly', 'tired_of', 'felicitous', 'disgust', 'wild', 'pitiful', 'fearful', 'lamentable', 'sorry', 'fearsome', 'repel', 'timorous', 'awful', 'well-chosen', 'dreaded', 'happy', 'surprised', 'dread', 'terrible', 'frightening', 'furious', 'direful', 'dreadful', 'dire', 'gross_out', 'raging', 'disgusted', 'distressing', 'fed_up', 'frightful', 'sick', 'sick_of', 'storm', 'deplorable', 'nauseate', 'horrific'}, 'motion': {'vaporize', 'backwash', 'range', 'be_active', 'race', 'pelt_along', 'whorl', 'bounce', 'affect', 'perish', 'aviate', 'square_up', 'movement', 'simulated_military_operation', 'swan', 'shuffling', 'cost_increase', 'stroll', 'creeping', 'run_short', 'butt_against', 'break_off', 'lean', 'cheat_on', 'turn_over', 'fly_front', 'scare_away', 'hasten', 'hurdle', 'scarper', 'persist', 'exhibit', 'crack', 'belt_along', 'wind', 'prima_donna', 'playground_slide', 'wh

In [39]:
# Define sets for faster membership testing
# targeted_entities = {"PERSON", "ORGANIZATION"}
# regionality_entities = {"LOCATION", "FACILITY", "GPE"}
# temporal_entities = {"DATE", "TIME"}

# Function to load the NRC Emotion Lexicon
def load_nrc_lexicon(file_path):
  """Load a tab-separated word/emotion/association file into a dict.

  Every data row is expected to hold exactly three tab-separated fields:
  ``word``, ``emotion`` and ``association``. An emotion is attached to a word
  only when its association field is the string ``'1'``.

  Args:
    file_path: path of the lexicon text file.

  Returns:
    A dict mapping each word seen in the file to the list of emotions
    associated with it. Words with no positive association map to an
    empty list.
  """
  lexicon = {}
  # newline='' is the mode the csv module documentation asks for; the explicit
  # encoding keeps the result independent of the platform default.
  with open(file_path, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
      # Blank lines come back as [] and stray lines may have fewer/more fields;
      # skip them instead of failing on the 3-way unpacking below.
      if len(row) != 3:
        continue
      word, emotion, association = row
      # Register the word even when this row carries no association
      emotions = lexicon.setdefault(word, [])
      if association == '1':
        emotions.append(emotion)
  return lexicon

def count_emotions_in_text(words, lexicon):
  """Tally how often each emotion of the lexicon is triggered by the given words.

  Args:
    words: iterable of tokens to look up.
    lexicon: dict mapping a word to the list of emotions associated with it.

  Returns:
    A dict with one key per emotion that appears anywhere in ``lexicon``
    (present even when its count is 0). A word contributes one count to each
    of its emotions every time it occurs in ``words``.
  """
  # Collect every distinct emotion label mentioned in the lexicon
  labels = set()
  for tagged in lexicon.values():
    for label in tagged:
      labels.add(label)

  tallies = dict.fromkeys(labels, 0)

  for token in words:
    # Tokens missing from the lexicon contribute nothing
    for label in lexicon.get(token, ()):
      tallies[label] += 1

  return tallies

# Load the word -> emotions mapping once; extract_feature reads this module-level name on every call.
nrc_lexicon = load_nrc_lexicon('/content/drive/MyDrive/Colab Notebooks/AI/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt')
# NLTK sentiment analyzer instance. In the visible part of the notebook it is
# only referenced from commented-out code inside extract_feature.
sid = SentimentIntensityAnalyzer()

def extract_feature(text):
  """Turn one piece of text into a flat tuple of numeric features.

  The returned tuple is laid out as:
    1. average sentence length of the cleaned text, measured in
       whitespace-separated words (0 when no sentence is found);
    2. one count per emotion, in the key order of the dict returned by
       ``count_emotions_in_text(clean_words, nrc_lexicon)``;
    3. number of ``!`` characters in the raw text;
    4. number of ``?`` characters in the raw text;
    5. number of ``"`` characters in the raw text.

  Cleaning is done with ``wordopt``; the emotion counts are computed on the
  tokens of the cleaned text, while the punctuation counts use the raw text.
  Sentiment-score and named-entity features are not computed.

  Args:
    text: the raw string to featurise.

  Returns:
    A tuple of numbers as described above.
  """
  clean_text = wordopt(text)

  # Tokenize the cleaned text into words and sentences once
  clean_words = word_tokenize(clean_text)
  sentences = sent_tokenize(clean_text)

  # Average sentence length; guard against texts that yield no sentence
  if sentences:
    avg_sentence_length = sum(len(sent.split()) for sent in sentences) / len(sentences)
  else:
    avg_sentence_length = 0

  emotion_counts = count_emotions_in_text(clean_words, nrc_lexicon)

  return (
    avg_sentence_length,
    *emotion_counts.values(),
    text.count('!'),  # exclamation marks
    text.count('?'),  # question marks
    text.count('"'),  # quotation marks
  )

In [40]:
# Compute the feature tuple for every title (one extract_feature call per row).
# This took about 2 minutes on this dataset.
feature = data['title'].apply(extract_feature)

In [52]:
# Take the emotion column names from count_emotions_in_text itself: its result
# keys are exactly the order in which extract_feature emits the emotion counts,
# so the labels cannot drift away from the values they describe.
emotion_columns = list(count_emotions_in_text([], nrc_lexicon))
columns = ["avg_sentence_length"] + emotion_columns + ["exclamation_count", "question_count", "quotation_count"]

# One row per title, one column per feature, then the label alongside
feature_with_label = pd.DataFrame(feature.tolist(), columns=columns)
feature_with_label = pd.concat([feature_with_label, data['label']], axis=1)
feature_with_label.head()

Unnamed: 0,avg_sentence_length,negative,disgust,anger,sadness,trust,anticipation,fear,surprise,joy,positive,exclamation_count,question_count,quotation_count,label
0,12.0,0,0,0,0,1,1,0,0,1,1,0,0,0,1
1,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,10.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,9.0,1,0,1,0,0,0,1,0,0,0,0,0,0,1


In [53]:
# Persist the feature table (including the label column) to a CSV on Drive.
# index=False keeps the row index out of the file.
store_path = '/content/drive/MyDrive/Colab Notebooks/AI/feature.csv'
feature_with_label.to_csv(store_path, index=False)

In [54]:
# Reload the feature table from the CSV at store_path, replacing the in-memory frame.
feature_with_label = pd.read_csv(store_path)
# feature_with_label.head()