In [1]:
import pandas as ps
from IPython.display import Image
import os
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: seaborn "whitegrid" theme first, then larger fonts for the
# axis labels (14) and the x/y tick labels (12).
sns.set(style="whitegrid")
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Figure output location: an "images" folder under the project root,
# created up front so later save calls do not fail on a missing directory.
PROJECT_ROOT_DIR = "."
# NOTE(review): CHAPTER_ID is not referenced in the visible cells and its value
# looks carried over from another project -- confirm before relying on it.
CHAPTER_ID = "ProgettoFifa19"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(name=IMAGES_PATH, exist_ok=True)

============

## Terms of use

Please carefully read the "Terms of use.txt" file to understand our terms of use of this data.

==================

## Meta Information:

- We used paid workers from the Crowdflower crowdsourcing platform for labeling
- At least three different workers were required to agree on a label before a task is finalized
- We ran two tasks (as described below) for labeling
- Please cite the paper given below if you use any of these resources in your research

==================================

## Task # 1 Categories---Description:

1. Personal Only---if a message is only of interest to its author and her immediate circle of family/friends and does not convey any useful information to other people who do not know the author.
2. Informative (Direct)--- if the message is of interest to other people beyond the author's immediate circle, and seems to be written by a person who is a direct eyewitness of what is taking place.
3. Informative (Indirect)--- if the message is of interest to other people beyond the author's immediate circle, and seems to be seen/heard by the person on the radio, TV, newspaper, or other source. The message must specify the source.
4. Informative (Direct or Indirect)--- if the message is of interest to other people beyond the author's immediate circle, but there is not enough information to tell if it is a direct report or a repetition of something from another source.
5. Other--- if the message is not in English, or if it cannot be classified.


==================================

## Task # 2 Categories---Description:

1. Caution and advice---if a message conveys/reports information about some warning or a piece of advice about a possible hazard of an incident.
2. Casualties and damage---if a message reports the information about casualties or damage done by an incident.
3. Donations of money, goods or services---if a message speaks about money raised, donation offers, goods/services offered or asked by the victims of an incident.
4. People missing, found, or seen---if a message reports a missing or found person affected by an incident, or a celebrity seen visiting ground zero.
5. Information source---if a message conveys/contains some information sources like photo, footage, video, or mentions other sources like TV, radio related to an incident.

=====================================================================================

## Please cite the following paper, if you use any of these resources in your research

Muhammad Imran, Shady Elbassuoni, Carlos Castillo, Fernando Diaz and Patrick Meier. Practical Extraction of Disaster-Relevant Information from Social Media. In Social Web for Disaster Management (SWDM'13) - Co-located with WWW, May 2013, Rio de Janeiro, Brazil. 



# Dataset 1
01_personal-informative-other/a143145.csv

In [2]:
# Read the 01_personal-informative-other labels file and preview its first rows.
# `dataset` is rebound by every later loading cell.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/01_personal-informative-other/a143145.csv"
)
dataset.head()


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,nil,text_no_rt,tweet,user
0,221934923,True,golden,3,11/13/2012 02:06:24,Personal Only,1.0,Personal Only,,......................Ã¤Ã³Ã¬@Brains_x_Beauty: ...,RT @TheHolyKaron: ......................Ã¤Ã³Ã¬...,____DatPMF
1,221934924,True,golden,2,11/13/2012 02:16:03,Informative (Indirect),0.515,Informative (Indirect),,At least 39 dead millions without power in San...,At least 39 dead millions without power in San...,_3091004140882
2,221934925,True,golden,2,11/13/2012 02:06:50,Personal Only,1.0,Personal Only,,Clearly no one wants to look good for sandy la...,Clearly no one wants to look good for sandy la...,_JordanDawn_
3,221934926,True,golden,1,11/11/2012 01:44:50,Other,1.0,Informative (Indirect),,Hurricane #Sandy hits the Statue Of Liberty ht...,RT @ImJay_wbu: Hurricane #Sandy hits the Statu...,_k4typerry
4,221934927,False,finalized,3,11/10/2012 12:38:04,Informative (Indirect),1.0,,,Watch Hurricane #Sandy prowling the East Coast...,RT @NASA: Watch Hurricane #Sandy prowling the ...,_KieranJD_


In [3]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               1000 non-null   int64  
 1   _golden                1000 non-null   bool   
 2   _unit_state            1000 non-null   object 
 3   _trusted_judgments     1000 non-null   int64  
 4   _last_judgment_at      1000 non-null   object 
 5   choose_one             1000 non-null   object 
 6   choose_one:confidence  1000 non-null   float64
 7   choose_one_gold        51 non-null     object 
 8   nil                    1 non-null      object 
 9   text_no_rt             1000 non-null   object 
 10  tweet                  1000 non-null   object 
 11  user                   1000 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 87.0+ KB


In [4]:
# Copy of the loaded frame without the 'nil' column (info() above reports a
# single non-null value in it); the trailing expression displays the result.
DS = dataset.drop(columns='nil')
DS

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,text_no_rt,tweet,user
0,221934923,True,golden,3,11/13/2012 02:06:24,Personal Only,1.0000,Personal Only,......................Ã¤Ã³Ã¬@Brains_x_Beauty: ...,RT @TheHolyKaron: ......................Ã¤Ã³Ã¬...,____DatPMF
1,221934924,True,golden,2,11/13/2012 02:16:03,Informative (Indirect),0.5150,Informative (Indirect),At least 39 dead millions without power in San...,At least 39 dead millions without power in San...,_3091004140882
2,221934925,True,golden,2,11/13/2012 02:06:50,Personal Only,1.0000,Personal Only,Clearly no one wants to look good for sandy la...,Clearly no one wants to look good for sandy la...,_JordanDawn_
3,221934926,True,golden,1,11/11/2012 01:44:50,Other,1.0000,Informative (Indirect),Hurricane #Sandy hits the Statue Of Liberty ht...,RT @ImJay_wbu: Hurricane #Sandy hits the Statu...,_k4typerry
4,221934927,False,finalized,3,11/10/2012 12:38:04,Informative (Indirect),1.0000,,Watch Hurricane #Sandy prowling the East Coast...,RT @NASA: Watch Hurricane #Sandy prowling the ...,_KieranJD_
...,...,...,...,...,...,...,...,...,...,...,...
995,221941935,False,finalized,3,11/10/2012 23:43:00,Informative (Indirect),1.0000,,LATEST | Airports and stock exchange reopen; N...,LATEST | Airports and stock exchange reopen; N...,TheSunNews
996,221941936,False,finalized,4,11/10/2012 22:29:50,Personal Only,0.7401,,I'm sorry for all those who are suffering due ...,I'm sorry for all those who are suffering due ...,garfield_kathy
997,221941937,False,finalized,4,11/11/2012 06:41:18,Informative (Direct or Indirect),0.7597,,looks like sandy has hit stourport!,looks like sandy has hit stourport!,kathsandford
998,221941938,False,finalized,3,11/12/2012 21:26:10,Informative (Indirect),0.6521,,The Voice : Hurricane Sandy Caused by Global W...,The Voice : Hurricane Sandy Caused by Global W...,MainStMonroe


# Dataset 2
02_informative_caution-infosrc-donation-damage-other/a144267.csv

In [5]:
# Read the 02_informative_caution-infosrc-donation-damage-other labels file
# (replacing the previously loaded frame) and preview its first rows.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/02_informative_caution-infosrc-donation-damage-other/a144267.csv"
)
dataset.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,text_no_rt,tweet,type,user
0,223607030,False,finalized,3,11/14/2012 03:26:51,Casualties and damage,1.0,,28 deaths due to Hurricane Sandy | Charlotte t...,RT @HenderGrande: 28 deaths due to Hurricane S...,Informative (Direct or Indirect),1EmmaHenderson
1,223607031,True,golden,3,11/14/2012 03:41:27,Casualties and damage,1.0,Casualties and damage,Sandy kills 50 people nationwide 18 were New Y...,Sandy kills 50 people nationwide 18 were New Y...,Informative (Direct or Indirect),9NEWS
2,223607032,False,finalized,3,11/14/2012 03:38:40,Casualties and damage,0.6849,,RT @nowthised: Wider shot of scaffolding toppl...,RT @twc_hurricane: RT @nowthised: Wider shot o...,Informative (Direct or Indirect),AbrahamAkiva
3,223607033,True,golden,10,11/14/2012 03:35:39,Caution and advice,0.89,Caution and advice,#Sandy will be onshore about 6pm. Slightly fas...,RT @growingwisdom: #Sandy will be onshore abou...,Informative (Direct or Indirect),aecheylon
4,223607034,True,golden,7,11/14/2012 03:36:35,"Donations of money, goods or services",0.5727,"Donations of money, goods or services",@ChrisFRC lost startups can go to @AlleyNYC. T...,@ChrisFRC lost startups can go to @AlleyNYC. T...,Informative (Direct or Indirect),AHolidayiii


In [6]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543 entries, 0 to 542
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               543 non-null    int64  
 1   _golden                543 non-null    bool   
 2   _unit_state            543 non-null    object 
 3   _trusted_judgments     543 non-null    int64  
 4   _last_judgment_at      543 non-null    object 
 5   choose_one             543 non-null    object 
 6   choose_one:confidence  543 non-null    float64
 7   choose_one_gold        41 non-null     object 
 8   text_no_rt             543 non-null    object 
 9   tweet                  543 non-null    object 
 10  type                   543 non-null    object 
 11  user                   543 non-null    object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 47.3+ KB


# Dataset 3
03_caution-n-advice_classify-extract/a146283.csv

In [7]:
# Read the 03_caution-n-advice_classify-extract file (replacing the previously
# loaded frame) and preview its first rows.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/03_caution-n-advice_classify-extract/a146283.csv"
)
dataset.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,source,type_of_advice_or_caution,type_of_advice_or_caution:confidence,what,when,...,category,source_gold,text_no_rt,tweet,type,type_of_advice_or_caution_gold,user,what_gold,when_gold,where_gold
0,227175857,True,golden,8,12/8/2012 19:09:26,#sandy\nN/A\nN/A\nn/a\nN/A\n\nN/A\nn/a,A hurricane sighting has been reported,0.7629,Storm just now hitting Jersey\nAll bridges clo...,now\nnow\njust now\nn/a\nN/A\nnow\nnow\njust now,...,Informative (Direct or Indirect),,Storm just now hitting Jersey...Milford is fee...,Storm just now hitting Jersey...Milford is fee...,Caution and advice,A hurricane sighting has been reported,beball,hitting Jersey\nAll bridges closing in NYC & J...,now,NYC\nJersey\nJersey\nNYC & Jersey
1,227175859,False,golden,1,11/25/2012 17:31:06,,Other,1.0,Hurricane Sandy is going to fill the subways w...,,...,Informative (Direct or Indirect),,Hurricane Sandy is going to fill the subways w...,RT @JessieWessie33: Hurricane Sandy is going t...,Caution and advice,Other,britthanus,fill the subways with water,,NYC
2,227175860,True,golden,34,12/10/2012 06:05:12,\nn/a\n\n#Sandy\nn/a\nn/a\n#Sandy\n#Sandy\n\n\...,A hurricane sighting has been reported,0.7427,NYC is now impacted by hurricane\nimpacted\nhu...,now\nnow\n\nnow\nn/a\nn/a\nN/A\nnow\n\nnow\nno...,...,Informative (Direct or Indirect),,Omg NYC is now impacted by hurricane #Sandy,Omg NYC is now impacted by hurricane #Sandy,Caution and advice,A hurricane sighting has been reported,BuniYani,Impacted\nNYC is now impacted by hurricane\nim...,Now,NYC\nNYC
3,227175862,True,golden,31,12/10/2012 06:08:53,n/a\n\n#Sandy\n#Sandy\nN/A\n\n\n\n\nN/A\n\nN/A...,Other,0.9531,stock market closing\nstock market closing for...,n/a\n\nna\nN/A\nN/A\n\n1988\n\n\nN/A\n\nN/A\ns...,...,Informative (Direct or Indirect),,First time since 1888 stock market closing for...,First time since 1888 stock market closing for...,Caution and advice,Other,feverplay_xo,stock market closing\nstock market closing for...,N/A\nn/a,N/A\nn/a
4,227175867,False,golden,1,11/25/2012 17:24:06,,A hurricane sighting has been reported,1.0,Hurricane Sandy,,...,Informative (Direct or Indirect),,Hurricane Sandy or else Frankestorm heading to...,RT @KateNinaA: Hurricane Sandy or else Frankes...,Caution and advice,A hurricane sighting has been reported\nOther,LuciiAgudo,Hurricane Sandy or else Frankestorm,,heading toward NYC


In [8]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   _unit_id                              126 non-null    int64  
 1   _golden                               126 non-null    bool   
 2   _unit_state                           126 non-null    object 
 3   _trusted_judgments                    126 non-null    int64  
 4   _last_judgment_at                     126 non-null    object 
 5   source                                122 non-null    object 
 6   type_of_advice_or_caution             126 non-null    object 
 7   type_of_advice_or_caution:confidence  126 non-null    float64
 8   what                                  126 non-null    object 
 9   when                                  123 non-null    object 
 10  where                                 126 non-null    object 
 11  category           

# Dataset 3
03_damage-n-casualties_classify-extract/a146281.csv

In [9]:
# Read the 03_damage-n-casualties_classify-extract file (replacing the
# previously loaded frame) and preview its first rows.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/03_damage-n-casualties_classify-extract/a146281.csv"
)
dataset.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,hom_many_injured_or_dead_if_people,people_or_infrastructure,people_or_infrastructure:confidence,what_infrastructure_was_damaged_if_infrastructure,category,hom_many_injured_or_dead_if_people_gold,people_or_infrastructure_gold,text_no_rt,tweet,type,user,what_infrastructure_was_damaged_if_infrastructure_gold
0,227175113,True,golden,12,11/23/2012 14:36:01,28\n28\n28\n28\n28\n28\n28\n28\n28\n28\n28\n28,People: dead,1.0,\n\n\n\n\n\n\n\n\n\n\n,Informative (Direct or Indirect),28,People: dead,28 deaths due to Hurricane Sandy | Charlotte t...,RT @HenderGrande: 28 deaths due to Hurricane S...,Casualties and damage,1EmmaHenderson,
1,227175114,True,golden,12,11/23/2012 13:13:34,50\n50\n50\n50 people nationwide 18 were New Y...,Both people and infrastructure,0.7143,homes\n\n8 million homes\n8 million homes stil...,Informative (Direct or Indirect),50\nkills 50 people nationwide\n50 people nati...,People: dead\nBoth people and infrastructure,Sandy kills 50 people nationwide 18 were New Y...,Sandy kills 50 people nationwide 18 were New Y...,Casualties and damage,9NEWS,8 million homes still without power\n8 millio...
2,227175115,True,golden,7,11/23/2012 09:09:19,\n\n\n\n\n\n,"Infrastructure (building, bridge, road, etc.) ...",0.5353,\nscaffolding toppling car\nshot of scaffoldin...,Informative (Direct or Indirect),,"Infrastructure (building, bridge, road, etc.) ...",RT @nowthised: Wider shot of scaffolding toppl...,RT @twc_hurricane: RT @nowthised: Wider shot o...,Casualties and damage,AbrahamAkiva,scaffolding toppling car\nshot of scaffolding ...
3,227175116,True,golden,11,11/23/2012 13:20:51,\n\n\n\nnone\n\ntoll climbs\n\n\n\ntoll climbs,Both people and infrastructure,0.5651,"homes\npower, homes\nThousands of homes\nhomes...",Informative (Direct or Indirect),N/A\nN/a,Both people and infrastructure,Sandy death toll climbs; millions remain witho...,Sandy death toll climbs; millions remain witho...,Casualties and damage,alexandermimi,millions remain without power = electricity da...
4,227175117,True,golden,15,11/23/2012 14:25:53,\n\n\n\n\n\n\n\n\n\n\n\n\n\n,"Infrastructure (building, bridge, road, etc.) ...",1.0,Extensive flooding in all subway tunnels\nExte...,Informative (Direct or Indirect),,"Infrastructure (building, bridge, road, etc.) ...",Extensive flooding in all subway tunnels says ...,RT @emmagkeller: Extensive flooding in all sub...,Casualties and damage,alischaw,all subway tunnels\nExtensive flooding in all ...


In [10]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 17 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   _unit_id                                                170 non-null    int64  
 1   _golden                                                 170 non-null    bool   
 2   _unit_state                                             170 non-null    object 
 3   _trusted_judgments                                      170 non-null    int64  
 4   _last_judgment_at                                       170 non-null    object 
 5   hom_many_injured_or_dead_if_people                      170 non-null    object 
 6   people_or_infrastructure                                170 non-null    object 
 7   people_or_infrastructure:confidence                     170 non-null    float64
 8   what_infrastructure_was_damaged_if_infra

# Dataset 3
03_infosrc_classify-extract/a146274.csv

In [11]:
# Read the 03_infosrc_classify-extract file (replacing the previously loaded
# frame) and preview its first rows.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/03_infosrc_classify-extract/a146274.csv"
)
dataset.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,type_of_message,type_of_message:confidence,url_or_name_of_the_stationchannel,what,category,text_no_rt,tweet,type,type_of_message_gold,url_or_name_of_the_stationchannel_gold,user,what_gold
0,227174029,True,golden,7,11/25/2012 07:05:36,None of the above,0.8817,N/A\nn/a\nN/A\nN/A\nNA\nN/A\nN/A,sandy was a weak ass bitch\nsandy was a weak a...,Informative (Direct or Indirect),checked the news. yes there is schol tomoro. s...,checked the news. yes there is schol tomoro. s...,Information Source,None of the above,,BrandoCummando,
1,227174030,True,golden,5,11/25/2012 07:13:17,Look at this photo or these photos,0.4002,http://t.co/j6I531we\nhttp://t.co/j6I531we\nN/...,Hurricane #Sandy hits NYC hard with intense wi...,Informative (Direct or Indirect),Hurricane #Sandy hits NYC hard with intense wi...,Hurricane #Sandy hits NYC hard with intense wi...,Information Source,Look at this photo or these photos\nNone of th...,http://t.co/j6I531we,davidfig79,
2,227174031,True,golden,6,11/24/2012 04:04:24,None of the above,0.6951,n/a\nN/A\nN/A\nN/A\nN/A\nN/A,press conference LIVE now on #Sandy aftermath....,Informative (Direct or Indirect),Mayor Bloomberg holding press conference LIVE ...,RT @HuffingtonPost: Mayor Bloomberg holding pr...,Information Source,None of the above,,HurricSandyNews,
3,227174032,True,golden,1,11/24/2012 22:17:12,Look at this video or these videos,1.0,http://t.co/eofIJ7Fe,Statue of Liberty,Informative (Direct or Indirect),RT @mrspantsworth: Statue of Liberty Torchcam ...,RT @dirtymondaze: RT @mrspantsworth: Statue of...,Information Source,Look at this web site/page,http://t.co/eofIJ7Fe,loloster,some idea of the force of #sandy
4,227174033,True,golden,2,11/23/2012 02:49:17,Look at this web site/page,0.5455,http://t.co/eCGqw1ek\nhttp://t.co/eCGqw1ek,images from the empty #NYC subways\nfrom the e...,Informative (Direct),even the rats left town RT @YGNw: Eery images ...,even the rats left town RT @YGNw: Eery images ...,Information Source,Look at this photo or these photos,http://t.co/eCGqw1ek,dc_fusion,Eery images from the empty #NYC subways


In [12]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   _unit_id                                70 non-null     int64  
 1   _golden                                 70 non-null     bool   
 2   _unit_state                             70 non-null     object 
 3   _trusted_judgments                      70 non-null     int64  
 4   _last_judgment_at                       70 non-null     object 
 5   type_of_message                         70 non-null     object 
 6   type_of_message:confidence              70 non-null     float64
 7   url_or_name_of_the_stationchannel       70 non-null     object 
 8   what                                    70 non-null     object 
 9   category                                70 non-null     object 
 10  text_no_rt                              70 non-null     object 


# Dataset 4
04_combined_classification/a154774.csv

In [13]:
# Read 04_combined_classification/a154774.csv (replacing the previously loaded
# frame) and preview its first rows.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/04_combined_classification/a154774.csv"
)
dataset.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event:confidence,type_of_message,type_of_message:confidence,nil,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event_gold,tweet,tweet_no,tweet_no_rt,type_of_message_gold,user
0,238841781,False,finalized,4,1/2/2013 13:37:11,,,"Informative: offers/gives donations of money, ...",0.2689,,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
1,238841782,False,finalized,4,12/24/2012 14:05:56,True,1.0,Not informative: personal only,0.7772,,,@ChrisMara816: Screw #sandy we skipped right a...,116293,@ChrisMara816: Screw #sandy we skipped right a...,,kaatteexo
2,238841783,False,finalized,4,12/24/2012 14:05:56,True,1.0,Informative: information source with extensive...,0.2554,,,On The Learning Network Sandy as a Teaching To...,1091,On The Learning Network Sandy as a Teaching To...,,LotsToLearn
3,238841784,True,golden,35,1/2/2013 13:30:06,True,1.0,"Informative: damage (building, road, lines, etc.)",0.6938,,,RT @nytimes: More than 8 million homes in the ...,53537,More than 8 million homes in the U.S. are now ...,"Informative: damage (building, road, lines, etc.)",brunamacedo06
4,238841785,True,golden,28,1/2/2013 13:33:57,True,1.0,"Can not judge (not in English, too short, etc.)",0.5661,,,These Hurricane Sandy pages though,113571,These Hurricane Sandy pages though,"Can not judge (not in English, too short, etc.)",CameronXCV


In [14]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 16 columns):
 #   Column                                                                      Non-Null Count  Dtype  
---  ------                                                                      --------------  -----  
 0   _unit_id                                                                    1987 non-null   int64  
 1   _golden                                                                     1987 non-null   bool   
 2   _unit_state                                                                 1987 non-null   object 
 3   _trusted_judgments                                                          1987 non-null   int64  
 4   _last_judgment_at                                                           1987 non-null   object 
 5   the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event             375 non-null    object 
 6   the_author_of_the_tweet_seems_to_be_an_eye_witne

# Dataset 4
04_combined_classification/f154774.csv

In [15]:
# Read 04_combined_classification/f154774.csv (replacing the previously loaded
# frame) and preview its first rows.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/04_combined_classification/f154774.csv"
)
dataset.head()

Unnamed: 0,_unit_id,_created_at,_golden,_id,_missed,_started_at,_tainted,_channel,_trust,_worker_id,...,_ip,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event,type_of_message,nil,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event_gold,tweet,tweet_no,tweet_no_rt,type_of_message_gold,user
0,238841781,12/24/2012 13:41:45,False,787060207,,12/24/2012 13:37:36,False,instagc,0.7333,14425455,...,69.136.129.135,,Informative: other,,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
1,238841781,12/24/2012 13:44:17,False,787063467,,12/24/2012 13:39:45,False,instagc,0.8333,13441146,...,98.18.108.46,,"Informative: offers/gives donations of money, ...",,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
2,238841781,12/24/2012 13:54:14,False,787076220,,12/24/2012 13:51:24,False,golddiggergpt,0.8182,11092052,...,108.92.226.94,,Informative: celebrities or authorities react ...,,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
3,238841781,1/2/2013 13:37:11,False,797105603,,1/2/2013 13:34:10,False,instagc,0.7143,13166748,...,71.200.11.183,,Informative: caution or advice,,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
4,238841782,12/24/2012 13:24:41,False,787038536,,12/24/2012 13:22:21,False,instagc,0.9048,14031300,...,70.247.113.51,True,Not informative: personal only,,,@ChrisMara816: Screw #sandy we skipped right a...,116293,@ChrisMara816: Screw #sandy we skipped right a...,,kaatteexo


In [16]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7115 entries, 0 to 7114
Data columns (total 23 columns):
 #   Column                                                                Non-Null Count  Dtype  
---  ------                                                                --------------  -----  
 0   _unit_id                                                              7115 non-null   int64  
 1   _created_at                                                           7115 non-null   object 
 2   _golden                                                               7115 non-null   bool   
 3   _id                                                                   7115 non-null   int64  
 4   _missed                                                               290 non-null    object 
 5   _started_at                                                           7115 non-null   object 
 6   _tainted                                                              7115 non-null   bool   
 7

# Dataset 5
05_information_extraction/f157060.csv

In [17]:
# Read 05_information_extraction/f157060.csv (replacing the previously loaded
# frame) and preview its first rows.
dataset = ps.read_csv(
    "SWDM2013_dataset/sandy2012_labeled_data/05_information_extraction/f157060.csv"
)
dataset.head()

Unnamed: 0,_unit_id,_created_at,_golden,_id,_missed,_started_at,_tainted,_channel,_trust,_worker_id,...,_city,_ip,word_or_shortphrase,instruction,tweet,tweet_no,tweet_no_rt,type_of_message,type_of_messageconfidence,word_or_shortphrase_gold
0,240632763,1/2/2013 15:03:51,True,797143942,False,1/2/2013 15:01:51,False,instagc,0.9167,13441146,...,Cleveland,98.18.123.70,homes in the U.S. are now without power post-s...,"names a structure, road, service, line, etc. t...",RT @nytimes: More than 8 million homes in the ...,53537,More than 8 million homes in the U.S. are now ...,"Informative: damage (building, road, lines, etc.)",0.6938,power\nwithout power\n8 million homes in the U...
1,240632763,1/2/2013 15:06:13,True,797145153,False,1/2/2013 15:03:16,False,instagc,0.875,14893322,...,Lynchburg,96.235.212.161,8 million homes in the U.S. are now without power,"names a structure, road, service, line, etc. t...",RT @nytimes: More than 8 million homes in the ...,53537,More than 8 million homes in the U.S. are now ...,"Informative: damage (building, road, lines, etc.)",0.6938,power\nwithout power\n8 million homes in the U...
2,240632763,1/2/2013 15:09:30,True,797146945,False,1/2/2013 15:06:41,False,instagc,1.0,12960437,...,Point Pleasant,173.81.178.140,8 million homes in the U.S. are now without power,"names a structure, road, service, line, etc. t...",RT @nytimes: More than 8 million homes in the ...,53537,More than 8 million homes in the U.S. are now ...,"Informative: damage (building, road, lines, etc.)",0.6938,power\nwithout power\n8 million homes in the U...
3,240632763,1/2/2013 15:20:43,True,797152214,,1/2/2013 15:17:00,False,instagc,0.75,14475081,...,Dunlap,98.143.6.78,8 million homes in the U.S. are now without power,"names a structure, road, service, line, etc. t...",RT @nytimes: More than 8 million homes in the ...,53537,More than 8 million homes in the U.S. are now ...,"Informative: damage (building, road, lines, etc.)",0.6938,power\nwithout power\n8 million homes in the U...
4,240632763,1/2/2013 15:32:29,True,797157904,,1/2/2013 15:30:17,False,instagc,1.0,14049868,...,Toledo,131.183.0.122,without power,"names a structure, road, service, line, etc. t...",RT @nytimes: More than 8 million homes in the ...,53537,More than 8 million homes in the U.S. are now ...,"Informative: damage (building, road, lines, etc.)",0.6938,power\nwithout power\n8 million homes in the U...


In [18]:
# A bare expression is echoed only when it is the last statement of a cell,
# so the shape must be printed explicitly; info() writes its summary to stdout.
print(dataset.shape)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2847 entries, 0 to 2846
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   _unit_id                   2847 non-null   int64  
 1   _created_at                2847 non-null   object 
 2   _golden                    2847 non-null   bool   
 3   _id                        2847 non-null   int64  
 4   _missed                    51 non-null     object 
 5   _started_at                2847 non-null   object 
 6   _tainted                   2847 non-null   bool   
 7   _channel                   2847 non-null   object 
 8   _trust                     2847 non-null   float64
 9   _worker_id                 2847 non-null   int64  
 10  _country                   2847 non-null   object 
 11  _region                    2847 non-null   object 
 12  _city                      2847 non-null   object 
 13  _ip                        2847 non-null   objec

In [29]:
# Drop the listed bookkeeping/annotation columns from the currently loaded
# frame and write what is left to instruction.csv (row index included).
# NOTE(review): the execution count jumps from 18 to 29, so this presumably ran
# against the 05_information_extraction frame -- confirm `dataset` still holds it.
DS = dataset.drop(
    columns=[
        '_unit_id',
        '_created_at',
        '_golden',
        '_id',
        '_missed',
        '_started_at',
        '_tainted',
        '_channel',
        '_trust',
        '_worker_id',
        '_country',
        '_region',
        '_city',
        '_ip',
        'word_or_shortphrase',
        'tweet',
        'tweet_no',
        'type_of_messageconfidence',
        'word_or_shortphrase_gold',
    ]
)
DS.to_csv('instruction.csv')
