# preparing message data for text analysis

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

In [2]:
# Load the raw message export into a DataFrame.
text = pd.read_csv(filepath_or_buffer='output.csv')

In [3]:
# View a small random sample of the data to see a greater variety in message
# types than head() would show.
# n=10 avoids shuffling and copying half of the frame just for a preview, and
# random_state makes the displayed rows reproducible across kernel restarts.
text.sample(n=min(10, len(text)), random_state=0)

Unnamed: 0,channel_type,channel_name,id,type,date,edited,from,text,type2,text2
199527,private_group,21:00 (10/07) Muhammed (2)🏆3️⃣,1289166,message,2018-06-21T21:16:40,1970-01-01T02:00:00,Thabo Msweli,Monday at 9pm 📆,,
987098,private_group,16:00 (10/07) Samhaa (3)🏆3️⃣,1071830,message,2018-05-31T13:38:53,1970-01-01T02:00:00,Simphiwe Mfaba,Well done!,,
80241,private_group,21:00 (10/07) Zakariyya (3)🏆3️⃣,997292,message,2018-05-24T21:00:18,1970-01-01T02:00:00,Zakariyya Kaka,Ready to start,,
718958,private_group,20:00 (10/07) Chavanee (2)🏆3️⃣,79368,message,2018-03-13T20:08:17,1970-01-01T02:00:00,Zinhle Dlamini,👍,,
316174,private_group,20:00 (26/03) Zahra (3) 🏆3,867973,message,2018-05-15T19:16:05,1970-01-01T02:00:00,ThishaBot,1) Translate the following sentences or words ...,,
265810,private_group,17:00 (10/07) Nikola (2)🏆3️⃣,259227,message,2018-03-26T17:14:24,1970-01-01T02:00:00,TZ77 Sandisiwe Luthuli,👍,,
10018,private_group,21:00(03/07)Yazdan (3)🏆3️⃣,672089,message,2018-04-27T09:09:20,1970-01-01T02:00:00,ThishaBot,TRANSLATE TO NEGATIVE:;;They are speaking. Th...,,
161712,private_group,18:00 (10/07) Abongile (2)🏆3️⃣,1136381,message,2018-06-07T18:11:47,1970-01-01T02:00:00,TZ84 Karabo Masoabi,-Isa ;As in 'phakamisa' 😊,,
112199,private_group,20:00 (10/07) Sadia (1)🏆3️⃣,79883,message,2018-03-13T20:13:02,1970-01-01T02:00:00,Nondumiso Zondo,Ngiyabonga. See you tomorrow ☺️,,
8997,private_group,15:30(12/06)Muhammed Akoojee (3) (5)🏆,1082052,message,2018-06-01T11:01:33,1970-01-01T02:00:00,TZ63 Winile Mnikathi,"Oh he's here😇,awesome 👍",,


In [4]:
# Ignore this channel for now.
# .copy() makes the filtered frame independent of the one it was sliced from,
# so the column assignments and drops applied to `text` further down do not
# trigger SettingWithCopyWarning.
text = text[text['channel_name'] != 'Project Dokotela Tutors'].copy()

## fix error where tutors weren't labeled 

In [5]:
#names = text['from'].unique()

Data generated by Thomas, in Google Docs.

In [6]:
# Load the name roster used to identify tutors.
all_tutors = pd.read_csv(filepath_or_buffer='all_tutors.csv')

In [7]:
# The file name is misleading, so double check: keep only the rows that are
# explicitly flagged as tutors.
is_tutor = all_tutors['Is Tutor'].eq('Yes')
all_tutors = all_tutors.loc[is_tutor]

In [8]:
# Work on a separate copy: a column is renamed below and that rename should
# not carry over to `text`.
text2 = text.copy(deep=True)

In [9]:
# Give the sender column the same name as the roster's name column so the
# two frames can be merged on it.
text2 = text2.rename(columns={'from': 'Full Name'})

In [10]:
# Preview the first rows of the tutor roster.
all_tutors.head(n=5)

Unnamed: 0,Screen Names,Telegram ID,Student Number,Full Name,Is Tutor,Can Ignore
4,Zandisiwe Buthelez,492215600,0,Zandisiwe Buthelezi,Yes,No
10,TZ77 Sandisiwe Luthuli,480490731,0,Sandisiwe Luthuli,Yes,No
16,Buhle Mathews,489406670,1077241,Buhle Mathews,Yes,No
24,TZ155 Nontobeko Mthembu,512174924,0,Nontobeko Mthembu,Yes,No
28,TZ137 Lungile Ntuli,489855254,0,Lungile Ntuli,Yes,No


Tutors who are mislabeled are missing 'TZ' in front of their name. Counterintuitively, an inner join of the tutor dataset and the full dataset shows us the missing tutors: a tutor whose sender name lacks the 'TZ' prefix matches their 'Full Name' in the all_tutors list exactly, whereas a correctly labeled tutor (e.g. 'TZ77 Sandisiwe Luthuli') does not match.

In [11]:
# Inner join on the shared name column: only messages whose sender name
# equals a roster 'Full Name' survive.
names = text2.merge(all_tutors, how='inner', on='Full Name')

In [12]:
# Number of distinct names on each side of the join (messages, then roster).
for frame in (text2, all_tutors):
    print(frame['Full Name'].nunique())

635
54


In [13]:
# Number of distinct names that matched in the join.
names.loc[:, 'Full Name'].nunique(dropna=True)

24

In [14]:
# Collapse the merged frame to a Series holding each unlabeled name once.
# NOTE(review): `names` is rebound from a DataFrame to a Series here, so this
# cell cannot be re-run without re-running the merge first.
unlabeled = pd.unique(names['Full Name'])
names = pd.Series(unlabeled)

In [15]:
# Display the series of unlabeled tutor names.
names

0              Buhle Mathews
1         Siyasanga Mbikwana
2               Thabo Msweli
3            Gcinile Thabede
4           Sinqobile Mkhize
5            Ayanda Dubazana
6             Simphiwe Mfaba
7            Noxolo Vilakazi
8              Bongiwe Nyawo
9         Siphesihle Manqele
10    Sinenhlanhla Mkhwanazi
11        Lebogang Kegoamang
12        Slindokuhle Bhengu
13            Bongiwe Segasa
14           Kholeka Mdakane
15            Zinhle Dlamini
16           Nondumiso Zondo
17        Thandekile Khumalo
18          Nonhlanhla Nzama
19          Confidence Sambo
20            Buyisiwe Njoko
21         Bridgette Khumalo
22          Andile Nhlabathi
23       Khanyisile Madikane
dtype: object

Use np.where to match sender names in the full dataset against the series of missing names, and prefix the name with 'TZ ' where there is a match.

In [16]:
# Senders found in `names` get a 'TZ ' prefix; every other sender is kept
# unchanged.
sender = text['from']
needs_prefix = sender.isin(names)
text['from'] = np.where(needs_prefix, 'TZ ' + sender, sender)

In [17]:
# Spot-check the relabeling on a small random sample.
# n=10 avoids shuffling and copying a quarter of the frame just for a preview,
# and random_state makes the displayed rows reproducible across kernel restarts.
text.sample(n=min(10, len(text)), random_state=0)

Unnamed: 0,channel_type,channel_name,id,type,date,edited,from,text,type2,text2
50142,private_group,20:00 (10/07) Matthew (3)🏆3️⃣,1069223,message,2018-05-31T07:06:33,1970-01-01T02:00:00,ThishaBot,Correct answer: (e) Ulungele ukufunda isiZulu?,,
497241,private_group,18:00(10/07) Davida (2)🏆3️⃣,748566,message,2018-05-04T17:09:30,1970-01-01T02:00:00,TZ Bongiwe Nyawo,Thanks! Nawe ube nempela sonto enhle 😊 (have a...,,
490383,private_group,18:30 (10/07) Kholofelo (1) 🏆3⃣,1337436,message,2018-06-27T20:53:45,1970-01-01T02:00:00,ThishaBot,Translate and send a VN of the following: Are...,,
347523,private_group,18:30 (11/07) Dylan (2)🏆3️⃣,787632,message,2018-05-08T18:30:03,1970-01-01T02:00:00,ThishaBot,- Sawubona 😊,mention,@Bunjoyi
285059,private_group,17:00 (10/07) Michael (1)🏆3️⃣,1482099,message,2018-07-11T17:13:53,1970-01-01T02:00:00,TZ Slindokuhle Bhengu,You can in isizulu we will learn that,,
801654,private_group,18:00 (10/07) Ciara (2)🏆3️⃣,270364,message,2018-03-26T20:02:44,1970-01-01T02:00:00,ThishaBot,And that's it for lesson 5 - nice job! In the ...,,
358686,private_group,18:00 (10/07) Aidan (2)🏆3️⃣,758043,message,2018-05-07T15:00:07,1970-01-01T02:00:00,ThishaBot,- Sawubona 😊,mention,@Bunjoyi
960160,private_group,16:00 (19/06) Simon (3) 🏆 3,359472,message,2018-04-03T07:55:45,1970-01-01T02:00:00,Simon Fraser-King,Wow that was a battle to teach my head,,
914578,private_group,08:00 (10/07) Mahomed (1)🏆3️⃣,93086,message,2018-03-14T14:32:42,1970-01-01T02:00:00,Mahomed Aboo,Sawubona Thandekile,,
60271,private_group,19:30 (10/07) Caryn (3)🏆3️⃣,668608,message,2018-04-26T19:45:30,1970-01-01T02:00:00,TZ Buhle Mathews,👍🏾,,


<h2> Type identification </h2>

### creating a new attribute that succinctly categorizes the sender of each message

In [None]:
# Classify each sender into a coarse role, stored in the new 'from_2' column.
# Conditions are evaluated in order and the first match wins; senders that
# match none of them are labeled "student".
#
# - `np` is used directly: the `pd.np` alias is deprecated since pandas 1.0
#   and removed in pandas 2.0.
# - na=False: a missing sender would otherwise yield NaN in the mask, which
#   NumPy evaluates as truthy and would label the row "tutor".
# - regex=False: the patterns are plain substrings, not regular expressions.
sender = text['from'].str
text['from_2'] = np.select(
    [
        sender.contains("TZ", case=True, na=False, regex=False),
        sender.contains("ThishaBot", na=False, regex=False),
        sender.contains("Setup", na=False, regex=False),
        sender.contains("Set up", na=False, regex=False),
        sender.contains("UthiniSupport", na=False, regex=False),
    ],
    ["tutor", "bot", "setup", "setup", "UthiniSupport"],
    default="student",
)

In [None]:
# Remove the columns that are not needed for the analysis.
irrelevant_columns = ['type', 'type2', 'text2', 'edited']
text = text.drop(columns=irrelevant_columns)

In [20]:
# Count how many messages have a missing 'text' value (True) versus a
# present one (False).
text['text'].isna().value_counts()

False    1084520
True        1860
Name: text, dtype: int64

## output as csv with all tutors identified and new attribute added

In [21]:
# The export is currently disabled; uncomment the line below to write the
# labeled frame (without its index) to 'labeled_output.csv'.
#text.to_csv('labeled_output.csv', index=False) 