In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Step up one level so the working directory becomes the task 3 folder.
# Note: this is relative to wherever the kernel started, so re-running the
# cell moves up one more level each time.
os.chdir(os.pardir)

In [3]:
# Show the current working directory to confirm the chdir above landed in
# the task 3 folder; the relative data paths used below depend on it.
os.getcwd()

'C:\\Users\\bwils\\OneDrive\\Documents\\GitHub\\WeoWater\\task-3-wrangling-preprocessing'

In [4]:
# Load the merged dataset, which contains English-language posts only.
# The path is relative to the current working directory.
merged_csv = 'All_merged/All_consolidated_and_little_processed.csv'
all_data = pd.read_csv(merged_csv)

In [5]:
# Read in the merged Instagram scrape (a separate file from the English-only
# merged dataset loaded into all_data).
# The path uses forward slashes so it resolves on Windows, macOS and Linux;
# a backslash separator is only treated as a directory separator on Windows.
# NOTE(review): IG is not referenced by any later cell visible here - confirm
# whether this load is still needed.
IG = pd.read_csv('Scraped Instagram Data - Merged/insta_merged2021_6_28.csv')

In [6]:
# List the column names available in the merged dataset.
all_data.columns

Index(['Id', 'title', 'time', 'keywords', 'data source', 'category', 'country',
       'source URL', 'body', 'language'],
      dtype='object')

In [7]:
# Keep only the rows whose 'data source' is Instagram.
from_instagram = all_data['data source'] == 'Instagram'
ig_data = all_data[from_instagram]

In [8]:
# Save the Instagram-only subset to the current working directory.
# The DataFrame index is written as the first (unnamed) column.
english_ig_csv = 'IG_english_only.csv'
ig_data.to_csv(english_ig_csv)

In [9]:
# Confirm the Instagram subset still has the same columns as all_data
# (row filtering does not change the column set).
ig_data.columns

Index(['Id', 'title', 'time', 'keywords', 'data source', 'category', 'country',
       'source URL', 'body', 'language'],
      dtype='object')

In [10]:
# Summarise dtypes and non-null counts per column of the Instagram subset,
# which shows which columns are empty for these rows.
ig_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1554 entries, 4221 to 5774
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Id           1554 non-null   object
 1   title        0 non-null      object
 2   time         1533 non-null   object
 3   keywords     0 non-null      object
 4   data source  1554 non-null   object
 5   category     1554 non-null   object
 6   country      0 non-null      object
 7   source URL   1554 non-null   object
 8   body         1554 non-null   object
 9   language     1554 non-null   object
dtypes: object(10)
memory usage: 133.5+ KB


In [11]:
# Discard every column except the label ('category') and the post text
# ('body'); drop() returns a new frame and leaves ig_data untouched.
drop_columns = [
    'title',
    'time',
    'keywords',
    'data source',
    'country',
    'source URL',
    'language',
    'Id',
]
ig_data_lean = ig_data.drop(labels=drop_columns, axis='columns')

In [12]:
# Display the slimmed-down frame; only 'category' and 'body' remain and the
# index still carries the row labels inherited from all_data.
ig_data_lean

Unnamed: 0,category,body
4221,none,“Absence the highest form of presence”. #absen...
4222,none,Hearing aid design has come a very long way. W...
4223,none,Happy #PrideMonth from BAHA Irish Whiskey. Be ...
4224,none,Figuring out these crazy hands 😆\n\n#babiesofi...
4225,none,I am THRILLED to share these pictures with you...
...,...,...
5770,none,this pile is giving an impression how high wat...
5771,flood,Long weekend vibes \nAs you can see we had lot...
5772,none,On a bench at high watermark - learning in pro...
5773,flood,#letzebuerg #luxembourg #luxembourgcity #diffe...


In [13]:
# Renumber the rows from 0 and throw away the old labels inherited from
# all_data (drop=True keeps them from being added back as a column).
ig_data_lean = ig_data_lean.reset_index(drop=True)
ig_data_lean

Unnamed: 0,category,body
0,none,“Absence the highest form of presence”. #absen...
1,none,Hearing aid design has come a very long way. W...
2,none,Happy #PrideMonth from BAHA Irish Whiskey. Be ...
3,none,Figuring out these crazy hands 😆\n\n#babiesofi...
4,none,I am THRILLED to share these pictures with you...
...,...,...
1549,none,this pile is giving an impression how high wat...
1550,flood,Long weekend vibes \nAs you can see we had lot...
1551,none,On a bench at high watermark - learning in pro...
1552,flood,#letzebuerg #luxembourg #luxembourgcity #diffe...


In [14]:
# Split the lean frame by label. 'none' here is the literal category string,
# not a missing value.
is_flood = ig_data_lean['category'] == 'flood'
is_none = ig_data_lean['category'] == 'none'

ig_flood = ig_data_lean[is_flood]
ig_none = ig_data_lean[is_none]


In [15]:
# Check the size of the 'flood' class (row count and non-null counts).
ig_flood.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 102 to 1552
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  376 non-null    object
 1   body      376 non-null    object
dtypes: object(2)
memory usage: 8.8+ KB


In [16]:
# Check the size of the 'none' class (row count and non-null counts) for
# comparison with the 'flood' class.
ig_none.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1073 entries, 0 to 1553
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  1073 non-null   object
 1   body      1073 non-null   object
dtypes: object(2)
memory usage: 25.1+ KB


In [17]:
# Build a class-balanced dataset: keep every 'flood' post and down-sample the
# 'none' posts (without replacement) to the same number of rows.
# The sample size is taken from ig_flood instead of being hard-coded, so the
# two classes stay balanced if the upstream data changes. sample() raises a
# ValueError if ig_none has fewer rows than ig_flood.
# Rows with any other category are not part of train_test.
n_flood = len(ig_flood)
train_test = pd.concat([ig_flood, ig_none.sample(n=n_flood, replace=False, random_state=42)])
# Display only: the sorted view is not assigned, so train_test keeps the
# concatenation order (flood rows first, then the sampled 'none' rows).
train_test.sort_index()

Unnamed: 0,category,body
2,none,Happy #PrideMonth from BAHA Irish Whiskey. Be ...
3,none,Figuring out these crazy hands 😆\n\n#babiesofi...
5,none,Did you know that people with Down syndrome te...
9,none,The Battle Pass is back in the game! Colossal ...
10,none,Part 2: Blocks going up \n\n“Floridian Model” ...
...,...,...
1546,none,About water and paper.\n\n[EN] The combination...
1548,none,Turnhout. Venice at the river Aa. #rain #flood...
1550,flood,Long weekend vibes \nAs you can see we had lot...
1551,none,On a bench at high watermark - learning in pro...


In [18]:
# Seed NumPy's global RNG so the same test rows are drawn on every run.
np.random.seed(42)
# Hold out 25% of the rows of train_test (truncated to a whole number).
test_size = int(len(train_test) * .25)
# Draw that many distinct index labels; rows carrying them form the test set.
test_indices = np.random.choice(a=train_test.index, size=test_size, replace=False)

In [19]:
# Boolean mask over train_test: True where the row's index label was drawn
# for the test set. Computed once and reused for all four selections.
in_test = train_test.index.isin(test_indices)

# Features are the post text ('body'); labels are the 'category' column.
train_X_ig = train_test.loc[~in_test, 'body']
train_y_ig = train_test.loc[~in_test, 'category']
test_X_ig = train_test.loc[in_test, 'body']
test_y_ig = train_test.loc[in_test, 'category']

In [20]:
# Step up one more level so the relative output paths used when saving
# (which start with 'task-2-nlp-modeling') resolve.
# Note: re-running this cell moves a further level up each time.
os.chdir(os.pardir)

In [21]:
# Sanity check: after sorting both by index, the test features and the test
# labels should carry the same index label at every position.
features_index = test_X_ig.sort_index().index
labels_index = test_y_ig.sort_index().index
features_index == labels_index

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [22]:
# Persist the balanced dataset and its train/test splits as UTF-8 CSV files
# for the modeling task.
# Paths are assembled with os.path.join so the separator is right on every
# OS; backslash-separated literals only resolve to nested folders on Windows.
# The paths are relative to the current working directory and the target
# folder must already exist.
output_dir = os.path.join('task-2-nlp-modeling', 'train_test_data', 'instagram')
outputs = {
    'train_test.csv': train_test,
    'train_X_ig.csv': train_X_ig,
    'test_X_ig.csv': test_X_ig,
    'train_y_ig.csv': train_y_ig,
    'test_y_ig.csv': test_y_ig,
}
for file_name, data in outputs.items():
    data.to_csv(os.path.join(output_dir, file_name), encoding='utf-8')