# Loading the data and preprocessing the tweets


In [43]:
import pandas as pd
path = "/submission.csv"
data = pd.read_csv(path)
# data = data.iloc[700:800]
data = data.reset_index()
# data = data[data['keyword'].notna() & data['location'].notna()]
print(data.shape)
print(data.head(5))


(62, 4)
   index                                               text  target  preds
0    700                                  anxiety attack ??       0      0
1    701  seen on fahlo wcw all hail the queen ?? URL mt...       0      0
2    702          each time we try we always end up sinking       0      0
3    703                 confirmed the debris from mh370 ??       1      1
4    704  demolish deep space etoffe charmeuse clothesle...       0      0


In [44]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_tweet(tweet):
    # # 1. Convert to lowercase
    # tweet = tweet.lower()

    # 2. Tokenization
    tokens = nltk.word_tokenize(tweet)

    # # 3. Remove special characters and punctuation
    # tokens = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens]

    # 4. Remove URLs
    tokens = [token for token in tokens if not token.startswith('http')]

    # 5. Remove user mentions
    tokens = [token for token in tokens if not token.startswith('@')]

    # 6. Remove hashtags or convert to plain text
    tokens = [token[1:] if token.startswith('#') else token for token in tokens]

    # 6. Remove hashtags or convert to plain text
    tokens = [token[1:] if token.startswith('?') else token for token in tokens]

    # # 7. Remove numbers
    # tokens = [token for token in tokens if not token.isdigit()]

    # # 8. Remove stop words
    # stop_words = set(stopwords.words('english'))
    # tokens = [token for token in tokens if token not in stop_words]

    # 9. Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]


    tweet = ' '.join(tokens)

    return tweet

# Example usage
tweet = "California fires: Evacuation orders given to 13,000"
processed_tweet = preprocess_tweet(tweet)
print(processed_tweet)


California fire : Evacuation order given to 13,000


[nltk_data] Downloading package punkt to
[nltk_data]     /home/george.ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/george.ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/george.ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
data['processed_tweet'] = data['text'].apply(preprocess_tweet)
print(data.iloc[3].text)
print(data.iloc[3].processed_tweet)
print(data)

confirmed the debris from mh370 ??
confirmed the debris from mh370  
    index                                               text  target  preds  \
0     700                                  anxiety attack ??       0      0   
1     701  seen on fahlo wcw all hail the queen ?? URL mt...       0      0   
2     702          each time we try we always end up sinking       0      0   
3     703                 confirmed the debris from mh370 ??       1      1   
4     704  demolish deep space etoffe charmeuse clothesle...       0      0   
..    ...                                                ...     ...    ...   
57    757  i was on my way to gary but all the chicago en...       1      1   
58    758  the thing with rules is break it once it becom...       0      0   
59    759  turkish troops killed in kurdish militant 'sui...       1      1   
60    760  watch this airport get swallowed up by a sands...       1      1   
61    761                 only weapon im scared off is karma  

# Web Scraping


In [46]:
from googleapiclient.discovery import build


api_key = ""
cse_id = ""

def google_search(search_term, api_key, cse_id, **kwargs):
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs , num=1).execute()
    return res['items']



In [47]:
import requests
from bs4 import BeautifulSoup

def scrape_and_extract_text(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extracting the title
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else "No Title Found"

        # Extracting text content only from <p> tags
        text_content = [element.get_text(strip=True) for element in soup.find_all('p')]

        # Join all text-containing elements to form the body text
        body_text = ' '.join(text_content)

        return title, body_text

    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [48]:
# Function to scrape and extract data for a single processed_tweet
from timeout_decorator import timeout

@timeout(15)
def scrape_and_extract_data(processed_tweet, api_key, cse_id):
    try:
        results = google_search(processed_tweet, api_key, cse_id)
        documents = []
        headers = []
        sources = []
        if "genius.com" not in results[0]['link'] and "caneidhelp.miami.edu/" not in results[0]['link'] and "github" not in results[0]['link'] and "kaggle" not in results[0]['link'] and "huggingface" not in results[0]['link'] and "notebook.community" not in results[0]['link']:
            title, text = scrape_and_extract_text(results[0]['link'])
            if text:
                headers.append(results[0]['title'])
                documents.append(text)
                sources.append(results[0]['link'])
    except Exception as e:
        print(f"Error processing tweet: {processed_tweet}. Error: {e}")
        headers, documents, sources = None, None, None  # Or suitable defaults

    return headers, documents, sources

# Create a list of processed_tweet values from your DataFrame
processed_tweets = data['processed_tweet'].tolist()

# Sequentially process each tweet
for index, tweet in enumerate(processed_tweets):
    try:
        headers, documents, sources = scrape_and_extract_data(tweet, api_key, cse_id)
        # Update the DataFrame for the current row
        # data.at[index, 'headers'] = headers
        # data.at[index, 'documents'] = documents
        # data.at[index, 'source'] = sources'
        print(index)
        data.at[index, 'headers'] = ', '.join(headers) if headers else None
        data.at[index, 'documents'] = ', '.join(documents) if documents else None
        data.at[index, 'source'] = ', '.join(sources) if sources else None

    except Exception as e:
        print(f"Error processing tweet at index {index}: {e}")

    progress = ((index + 1) / len(processed_tweets)) * 100
    print(f"Progress: {index + 1}/{len(processed_tweets)} tasks completed ({progress:.2f}%)")


0
Progress: 1/62 tasks completed (1.61%)
1
Progress: 2/62 tasks completed (3.23%)
2
Progress: 3/62 tasks completed (4.84%)
3
Progress: 4/62 tasks completed (6.45%)
4
Progress: 5/62 tasks completed (8.06%)
5
Progress: 6/62 tasks completed (9.68%)
6
Progress: 7/62 tasks completed (11.29%)
7
Progress: 8/62 tasks completed (12.90%)
8
Progress: 9/62 tasks completed (14.52%)
9
Progress: 10/62 tasks completed (16.13%)
10
Progress: 11/62 tasks completed (17.74%)
An error occurred: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error processing tweet: and last year it wa just a lot of 'the drum are flooding ' and 'janice i 'm falling '. Error: cannot unpack non-iterable NoneType object
11
Progress: 12/62 tasks completed (19.35%)
12
Progress: 13/62 tasks completed (20.97%)
13
Progress: 14/62 tasks completed (22.58%)
14
Progress: 15/62 tasks completed (24.19%)
15
Progress: 16/62 tasks completed (25.81%)
16
Progress: 17/62 tasks completed (27.42%)
17
P

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


54
Progress: 55/62 tasks completed (88.71%)
55
Progress: 56/62 tasks completed (90.32%)
56
Progress: 57/62 tasks completed (91.94%)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


57
Progress: 58/62 tasks completed (93.55%)
58
Progress: 59/62 tasks completed (95.16%)
59
Progress: 60/62 tasks completed (96.77%)
Error processing tweet: watch this airport get swallowed up by a sandstorm in under a minute URL. Error: <HttpError 429 when requesting https://customsearch.googleapis.com/customsearch/v1?q=watch+this+airport+get+swallowed+up+by+a+sandstorm+in+under+a+minute+URL&cx=4607c26a6b7a44b5d&num=1&key=AIzaSyCyjquU5ziZUEzHs68aYggblRTbEs0Qck8&alt=json returned "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:773597908397'.". Details: "[{'message': "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:773597908397'.", 'domain': 'global', 'reason': 'rateLimitExceeded'}]">
60
Progress: 61/62 tasks completed (98.39%)
61
Progress: 62/62 tasks completed (100.00%)


In [49]:
data.head(10)

Unnamed: 0,index,text,target,preds,processed_tweet,headers,documents,source
0,700,anxiety attack ??,0,0,anxiety attack,"Anxiety attack: Symptoms, causes, and complica...",Anxiety can occur when a person fears that som...,https://www.medicalnewstoday.com/articles/307863
1,701,seen on fahlo wcw all hail the queen ?? URL mt...,0,0,seen on fahlo wcw all hail the queen URL mtv...,,,
2,702,each time we try we always end up sinking,0,0,each time we try we always end up sinking,Quality of Office 365 support is sinking.. - M...,‎Apr 21 201810:54 AM ‎Apr 21 201810:54 AM Just...,https://techcommunity.microsoft.com/t5/microso...
3,703,confirmed the debris from mh370 ??,1,1,confirmed the debris from mh370,MH370: Here's what's been found from jetliner ...,Three years after Malaysia Airlines Flight 370...,https://www.cnn.com/2017/03/08/asia/mh370-debr...
4,704,demolish deep space etoffe charmeuse clothesle...,0,0,demolish deep space etoffe charmeuse clothesle...,,,
5,705,is this the creepiest youth camp ever? URL,0,0,is this the creepiest youth camp ever URL,"[Serious] Campers of reddit, what is the scari...",r/AskReddit is the place to ask and answer tho...,https://www.reddit.com/r/AskReddit/comments/8j...
6,706,wreckage 'conclusively confirmed' as from mh37...,1,1,wreckage 'conclusively confirmed ' a from mh37...,Missing Malaysia plane MH370: What we know - B...,"The missing Malaysia Airlines plane, flight MH...",https://www.bbc.com/news/world-asia-26503141
7,707,reddit will now quarantine offensive content r...,0,0,reddit will now quarantine offensive content r...,COVID denialism and policy clarifications : r/...,/r/redditsecurity is a running log of actions ...,https://www.reddit.com/r/redditsecurity/commen...
8,708,robpulsenews huyovoetripolye phillips should b...,1,1,robpulsenews huyovoetripolye phillips should b...,,,
9,709,goulburn man henry van bilsen missing emergenc...,1,1,goulburn man henry van bilsen missing emergenc...,,,


In [50]:
data = data[data['headers'].notna() & data['documents'].notna()]

data

Unnamed: 0,index,text,target,preds,processed_tweet,headers,documents,source
0,700,anxiety attack ??,0,0,anxiety attack,"Anxiety attack: Symptoms, causes, and complica...",Anxiety can occur when a person fears that som...,https://www.medicalnewstoday.com/articles/307863
2,702,each time we try we always end up sinking,0,0,each time we try we always end up sinking,Quality of Office 365 support is sinking.. - M...,‎Apr 21 201810:54 AM ‎Apr 21 201810:54 AM Just...,https://techcommunity.microsoft.com/t5/microso...
3,703,confirmed the debris from mh370 ??,1,1,confirmed the debris from mh370,MH370: Here's what's been found from jetliner ...,Three years after Malaysia Airlines Flight 370...,https://www.cnn.com/2017/03/08/asia/mh370-debr...
5,705,is this the creepiest youth camp ever? URL,0,0,is this the creepiest youth camp ever URL,"[Serious] Campers of reddit, what is the scari...",r/AskReddit is the place to ask and answer tho...,https://www.reddit.com/r/AskReddit/comments/8j...
6,706,wreckage 'conclusively confirmed' as from mh37...,1,1,wreckage 'conclusively confirmed ' a from mh37...,Missing Malaysia plane MH370: What we know - B...,"The missing Malaysia Airlines plane, flight MH...",https://www.bbc.com/news/world-asia-26503141
7,707,reddit will now quarantine offensive content r...,0,0,reddit will now quarantine offensive content r...,COVID denialism and policy clarifications : r/...,/r/redditsecurity is a running log of actions ...,https://www.reddit.com/r/redditsecurity/commen...
10,710,broke my nail(real not fake) this morning bloo...,0,0,broke my nail ( real not fake ) this morning b...,my best employee quit on the spot because I wo...,"A reader writes: I manage a team, and part of ...",https://www.askamanager.org/2016/07/my-best-em...
12,712,deadpool is already one of my favourite marvel...,0,0,deadpool is already one of my favourite marvel...,Captain America: Civil War - Twenty Sided,Despite the big “Whose Side Are You On?” marke...,https://www.shamusyoung.com/twentysidedtale/?p...
13,713,u s record hurricane drought URL,1,1,u s record hurricane drought URL,Record drought gripped much of the U.S. in 202...,An official website of the United States gover...,https://www.noaa.gov/news/record-drought-gripp...
14,714,in every bts song jimin screams,0,0,in every bts song jimin scream,What are some chants that fans yell out togeth...,Something went wrong. Wait a moment and try ag...,https://www.quora.com/What-are-some-chants-tha...


In [51]:
data.shape

(37, 8)

In [52]:
# Specify the file path where you want to save the CSV file

file_path = "100Test.xlsx"

# Save the DataFrame as a CSV file
data.to_excel(file_path, index=False)  # Using backslash as an escape character

print(f"DataFrame saved as '{file_path}'")

DataFrame saved as '/home/george.ibrahim/Downloads/AI701/Project/Output_of_submission6.xlsx'
