In [2]:
# Packages for data processing and machine learning
import numpy as np, pandas as pd, sklearn as sk, torch as th 
# Packages for webpage crawling
import requests as r
from bs4 import BeautifulSoup as BS
# Packages for nature language processing
import spacy, pyinflect
from pyinflect import getInflection, getAllInflections, getAllInflectionsOOV
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from collections import Counter
from nltk.corpus import stopwords as sw
from nltk.util import ngrams as ng
from nltk.tokenize import word_tokenize as tk
from nltk.stem import WordNetLemmatizer as wn
# Packages for Twitter API and configuration
import tweepy as tw, configparser  
# Packages about time
import time as t, datetime as dt, rfc3339
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ky002\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ky002\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ky002\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ky002\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Reqest the webpages including the synonymsous words or meanings of these topics: car, buy, and drive
car_url='https://www.thesaurus.com/browse/car'
buy_url='https://www.thesaurus.com/browse/buy'
buying_url='https://www.thesaurus.com/browse/buying'
drive_url='https://www.thesaurus.com/browse/drive'

# Crawling the synonyms of four topics: (car, buy, buying, drive) from Thesurus.com
def keyword_extract(url_name,class_name):
    page_name=r.get(url_name)
    soup_name=BS(page_name.content,'html.parser')
    key_soup=soup_name.find('ul', class_=class_name).find_all('a')
    list_name=[]
    for key in key_soup:
        new_key=key['href'][8:].replace('%20',' ')
        list_name.append(new_key)
    return list_name

# I only captured the red-marked words which contain the most closed meanings as the chosen topics
car_sym=keyword_extract(car_url,'css-1xohnkh e1ccqdb60')[:14]
car_sym.insert(0,'car')
buy_sym=keyword_extract(buy_url,'css-wmtunb e1ccqdb60')[:4]
buy_sym.insert(0,'buy')
buying_sym=keyword_extract(buying_url,'css-1lj4erq e1ccqdb60')[:3]
buying_sym.insert(0,'buying')
buy_sym=buy_sym+buying_sym
drive_sym=keyword_extract(drive_url,'css-n85ndd e1ccqdb60')[:4]
drive_sym.insert(0,'drive')

# Altough I have got thirty words as required keywords for tweet requests, it is recommended to include different tenses or forms of words 
#, as many Twitter users will use them depending on different contents.
nlp = spacy.load('en_core_web_sm')

buy_str=' '.join(buy_sym)
buy_doc=nlp(buy_str)
buy_extension_list=[]
for num in range(len(buy_doc)):
    token = buy_doc[num]
    if token.tag_ in ['NN','VB','VBG']:
        buy_extension_list.append(token._.inflect('VB',inflect_oov=True))
buy_extension_list=[ele for ele in list(set(buy_extension_list+['invest','shop','transact'])) if ele]

def extension(sym):
    sym_str=' '.join(sym)
    sym_token=nlp(sym_str)
    extension_list=[]
    for num in range(len(sym_token)):
        token = sym_token[num]
        if token.tag_ in ['NN','VB','VBG']:
            if str(token)!=token._.inflect('VBD',inflect_oov=True)[:len(token)]:
                extension_list.append(token._.inflect('VBD',inflect_oov=True))              
            if str(token)!=token._.inflect('VBG',inflect_oov=True)[:len(token)]:
                 extension_list.append(token._.inflect('VBG',inflect_oov=True))
            if str(token)!=token._.inflect('VBN',inflect_oov=True)[:len(token)]:
                 extension_list.append(token._.inflect('VBN',inflect_oov=True))
            if str(token)!=token._.inflect('VBZ',inflect_oov=True)[:len(token)]:
                 extension_list.append(token._.inflect('VBZ',inflect_oov=True))
    return extension_list

buy_sym=list(set(extension(buy_extension_list)+buy_extension_list))
buy_sym.append('acquisition')

drive_sym=list(set(extension(drive_sym)+drive_sym))

# According to car registration records of the UK from 2019 to 2022, the top 10 sales brands were selected as they might be mentioned more frequently than the other brands when talking about automative topics
brand_list=['Ford','BMW','Volkswagen','Mercedes-Benz','Audi','Vauxhall','Toyota','Kia','Hyundai','Land Rover']

#While searching for Google Trends data, various spellings or expressions that a user could use while posting about a car model were checked.
brand_abb=['VW','Mercedes','Voho','Landy','Bimmer','MBZ']

# The most popular Car sale Models in the UK, as a same replacement of the brand name: One model for one brand
brand_model=['fiesta','corolla','Series','polo','sportage','tucson','corsa','A-Class','discovery','A3']

# Name of car types when people are talking a series of car
car_type=['coupe','hatchback','sedan','sports','suv']

# Names of many important parts of car are also added in as I found that is a part of comments when people value their cars:15 most important words from https://www.collinsdictionary.com/word-lists/car-parts-of-a-car
car_tool=['grip','bumper','tyre','brake','bonnet','airbag','carburettor','piston','engine','battery','fuel tank','hood','steering wheel','accelerator','seatbelt']

#But there isn't such words like automobiling/automobiled or jeeping/jeeped, so it has to be deleted
car_sym.remove('ride')
car_sym.append('vehicle')

def df_generator(name,list):
    return pd.DataFrame({name:list})
#Put all the words into a dataframe sorted by different segements.
keywords=pd.concat([df_generator('Car',car_sym),
                    df_generator('Buy',buy_sym),
                    df_generator('Drive',drive_sym),
                    df_generator('Brand',brand_list),
                    df_generator('Brand Abbreviation',brand_abb),
                    df_generator('Brand Model',brand_model), 
                    df_generator('Car Tool',car_tool),
                    df_generator('Car Type',car_type)],
                    axis=1)
print(keywords)

# To limit the search zone that has to contain the elements of car, purchase, drive 
keylist_of_car=car_sym+brand_list+brand_abb+brand_model+car_tool+car_type
keylist_of_buy=buy_sym
keylist_of_drive=drive_sym

#Build the content of query: element 'car' is compulsory which the other elements are optional
query_content='('+' '.join(keylist_of_car).replace(' ',' OR ')+') ('+' '.join(keylist_of_buy+keylist_of_drive).replace(' ',' OR ')+') lang:en place_country:GB -is:nullcast -has:links'
print(len(query_content), query_content)
# Build and read the config for password safety
config=configparser.RawConfigParser()
config.read('config.ini')

api_key=config['twitter']['api_key']
api_key_secret=config['twitter']['api_key_secret']
access_token=config['twitter']['access_token']
access_token_secret=config['twitter']['access_token_secret']
bearer_token=config['twitter']['bearer_token']

# Authtication of my Twitter api
client = tw.Client(bearer_token,api_key, api_key_secret,access_token, access_token_secret,wait_on_rate_limit=True)

# Building the timestrap of each hour
def date_range(start_date, end_date):
    while start_date <= end_date:
        yield start_date
        start_date+=dt.timedelta(hours=8)

# Set the starting at the first hour of May 1st 2019 and the end time at the same hour of May 1st 2022
start_date = datetime(2019, 5, 1, 0, 00,00)
end_date = datetime(2022, 3, 1, 0, 00,00)
# remember to change about the time
first_time=[]
second_time=[]
for single_date in date_range(start_date, end_date):
    first_time.append(single_date.strftime("%Y-%m-%d %H:%M:%S"))
    second_time.append(single_date.strftime("%Y-%m-%d %H:%M:%S"))

first_time=first_time[:-1]
second_time=second_time[1:]

def rfc_time_convetor(time_list):
    new_time_list=[]
    for single_record in time_list:
        datetime_object = datetime.strptime(single_record, "%Y-%m-%d %H:%M:%S")
        rfc_records=rfc3339.rfc3339(datetime_object)
        new_time_list.append(rfc_records)
    return new_time_list

rfc_first_time=rfc_time_convetor(first_time)
rfc_second_time=rfc_time_convetor(second_time)

# Retrieve all the required data from Twitter API from each eight hours per day of three years
for start_time, end_time in zip(rfc_first_time,rfc_second_time):
    tweet_info_small_list=[]
    paginator=tw.Paginator(client.search_all_tweets,
                            query_content,                            
                            end_time=end_time,       
                            start_time=start_time,
                            tweet_fields = ["created_at", "text", "lang"],
                            sort_order=['relevancy'],
                            max_results=100).flatten(limit=250)
    for tweet in paginator:
        tweet_info_small_list.append(tweet.data)
    tweets_datasource = pd.DataFrame(tweet_info_small_list)
    tweets_datasource.to_csv('C:/Users/ky002/Desktop/Dickens/Postgraduate/Dissertation/Data Source/twitter_full_version_data.csv',sep=',', mode='a',encoding='utf_8')

              Car          Buy    Drive          Brand Brand Abbreviation  \
0             car       invest     rode           Ford                 VW   
1            auto   purchasing   ridden            BMW           Mercedes   
2      automobile     transact   riding     Volkswagen               Voho   
3             bus     purchase  driving  Mercedes-Benz              Landy   
4     convertible    acquiring      run           Audi             Bimmer   
5            jeep      acquire     trip       Vauxhall                MBZ   
6       limousine          buy     tour         Toyota                NaN   
7         machine      bargain    drive            Kia                NaN   
8           motor         shop     ride        Hyundai                NaN   
9          pickup  acquisition    drove     Land Rover                NaN   
10  station wagon          NaN      NaN            NaN                NaN   
11          truck          NaN      NaN            NaN                NaN   

In [2]:
#Get rid of the empty lines and 
raw_data=pd.read_csv('TwData_100.csv',sep=',',header=0,encoding='utf_8')
raw_data=raw_data.drop(columns=['Unnamed: 0','Unnamed: 5'])
raw_data=raw_data[raw_data['id']!='id'].sort_values(by='created_at').reset_index(drop=True)

# A forbidden word dictionary for meaningless words
exclude_words=sw.words('english')
print(exclude_words)
exclude_words=exclude_words+['@','|','/','\'']

NameError: name 'pd' is not defined

In [8]:
pd.set_option('display.max_colwidth', None)


107471