# Feature Extraction from Title

In [1]:
import string
import pandas as pd
import numpy as np
import re

In [2]:
# Load the keyword lookup table; find_category() reads its "Lookup",
# "Category" and "Subcategory" columns.
Drugs = pd.read_csv("Drug Categories.csv")
# Preview the first ten rows of the lookup table.
Drugs.head(10)

Unnamed: 0,Category,Subcategory,Lookup,Subcategory2
0,Analgesics,Acetimophen,Acetimophen,
1,Analgesics,Carisoprodol,Carisoprodol,
2,Analgesics,Co-codamol,Co-codamol,
3,Analgesics,Co-dydramol,Co-dydramol,
4,Analgesics,Pethidine,Pethidine,
5,Analgesics,Suboxone,Suboxone,
6,Analgesics,Tilidine,Tilidine,
7,Analgesics,Tramadol,Tramadol,
8,Cannabis,Weed,Weed,
9,Cannabis,Weed,Marijuana,


In [3]:
####DATA EXTRACTION FUNCTIONS####

#Find_weight searches the title of a product for a numerical weight and returns if found.
def find_weight(title):
    """Search a product title for a weight and return it in grams.

    The title is lower-cased, parentheses are removed, "***" and "..." are
    replaced by whitespace, and the result is split into tokens.  For every
    unit keyword (in priority order) each token is inspected:

    * token equals the keyword      -> parse the preceding token ("20 grams")
    * token contains the keyword    -> parse the token with the keyword
                                       removed ("200mg", "5oz")

    For ranges such as "90-120mg" only the part before the first "-" is used.

    Parameters
    ----------
    title : str
        Product title.

    Returns
    -------
    float or None
        Weight converted to grams, or None when no weight can be parsed.
    """
    #Keywords for weight, in priority order (more specific units come first so
    #that "mg"/"kg"/"ug" are tried before the bare "g").
    #NOTE: the former " g " keyword was dropped; tokens produced by split()
    #never contain whitespace, so it could never match.
    words = ["mg", "kg", "ug", "grams", "gr.", "g", "gr", "gram", "g.", "gramm", "oz", "ounce"]

    #(multiplier, divisor) converting each unit into grams; units that are
    #missing here are already grams.
    to_grams = {
        "mg": (1.0, 1000.0),
        "kg": (1000.0, 1.0),
        "ug": (1.0, 1000000.0),
        "oz": (28.3495, 1.0),
        "ounce": (28.3495, 1.0),
    }

    #Basic cleaning
    s = title.lower()          #Set all characters to lowercase
    s = re.sub("[()]", "", s)  #Remove parentheses
    s = s.replace("***", " ")  #Replace *** symbol with whitespace
    s = s.replace("...", " ")  #Replace ... symbol with whitespace
    s = s.split()              #Split the title into tokens

    #Searches the title for any of the keywords
    for word in words:
        multiplier, divisor = to_grams.get(word, (1.0, 1.0))
        #Loops through each token in the title
        for i in range(len(s)):
            if s[i] == word:
                #Stand-alone unit: the number is the preceding token.
                #A unit in first position has no preceding token (s[-1] would
                #silently wrap around to the LAST token of the title).
                if i == 0:
                    continue
                candidate = s[i - 1]
            elif word in s[i]:
                #Unit glued to the number (ex. 200mg, 5oz): strip the unit.
                candidate = s[i].replace(word, "")
            else:
                continue

            #Keep only the lower bound of a range (ex. 90-120 -> 90)
            try:
                value = float(candidate.split("-", 1)[0])
            except ValueError:
                #Not a number, keep searching
                continue
            return value * multiplier / divisor

    #Returns None if no weight can be found
    return None

#Find_count searches the title of a product for a numerical count and returns if found.
def find_count(title):
    """Search a product title for a unit count (tabs, pills, ...).

    The title is lower-cased, parentheses are removed, "***" is replaced by
    whitespace, and the result is split into tokens.  For every count keyword
    (in priority order) each token is inspected:

    * token equals the keyword   -> parse the preceding token ("20 x ..."),
                                    otherwise the following one ("pills x 20")
    * token contains the keyword -> parse the token with the keyword removed
                                    ("50x", "500pills")

    If no keyword yields a number, the first all-digit token whose value
    differs from find_weight(title) is returned.

    Parameters
    ----------
    title : str
        Product title.

    Returns
    -------
    float or None
        The count, or None when no number can be found.
    """
    #Keywords for count
    words = ["x", "tabs", "tablets", "capsules", "pills"]

    #Basic cleaning
    s = title.lower()          #Set all characters to lowercase
    s = re.sub("[()]", "", s)  #Remove parentheses
    s = s.replace("***", " ")  #Replace *** symbol with whitespace
    s = s.split()              #Split the title into tokens

    def leading_number(token):
        #Keep only the part before the first "-" (ex. 90-120 -> 90)
        return float(token.split("-", 1)[0])

    #Searches the title for any of the keywords
    for word in words:
        #Loops through each token in the title
        for i in range(len(s)):
            if s[i] == word:
                #Stand-alone keyword: try the preceding token, then the
                #following one.  Neighbours that do not exist are skipped
                #(s[-1] would silently wrap around to the last token).
                neighbours = []
                if i > 0:
                    neighbours.append(s[i - 1])
                if i + 1 < len(s):
                    neighbours.append(s[i + 1])
                for candidate in neighbours:
                    try:
                        return leading_number(candidate)
                    except ValueError:
                        continue
            elif word in s[i]:
                #Keyword glued to the number: strip the keyword
                try:
                    return leading_number(s[i].replace(word, ""))
                except ValueError:
                    continue

    #If the keyword search fails, return the first number that does not equal
    #the weight.  The weight is computed once instead of once per token.
    weight = find_weight(title)
    for token in s:
        if not token.isdigit():
            continue
        try:
            value = float(token)
        except ValueError:
            continue
        if value != weight:
            return value

    #If no number can be found, return None
    return None

#Find_category takes a product entry and classifies it into a Category
def find_category(row, lookup=None):
    """Classify a product entry using a keyword lookup table.

    The lower-cased product title is searched for every lookup keyword; if
    none matches, the lower-cased product category is searched the same way
    (the scraped categories are not standardised).  The first keyword found
    wins, in table order.

    Parameters
    ----------
    row : mapping
        Product entry supporting row["title"] and row["category"].
    lookup : table with "Lookup", "Subcategory" and "Category" columns, optional
        Keyword table to use.  Defaults to the notebook-level ``Drugs`` frame.

    Returns
    -------
    tuple
        (Subcategory, Category) of the first matching keyword -- note the
        order, subcategory first.  When nothing matches, the original
        row["category"] is returned for both elements.
    """
    if lookup is None:
        lookup = Drugs

    #Lower-case every keyword once per call; skip missing keywords, which
    #would otherwise crash on .lower().
    entries = [
        (word.lower(), subcategory, category)
        for word, subcategory, category in zip(
            lookup["Lookup"], lookup["Subcategory"], lookup["Category"]
        )
        if not pd.isnull(word)
    ]

    #Search the title first, then fall back to the scraped category
    for field in ("title", "category"):
        text = str(row[field]).lower()
        for word, subcategory, category in entries:
            if word in text:
                return (subcategory, category)

    #If none of the lookup values are in the Title or the Category, use the
    #original Category as both SubCat and Cat
    return (row["category"], row["category"])

In [4]:
#Read in raw scraped data
new = pd.read_csv("output.csv")
#print() with a single argument prints the same text on Python 2 and Python 3
print(new.shape)
#Preview the first ten raw rows
new.head(10)

(100002, 10)


Unnamed: 0,title_date,category,date,market,price,price_dollar,ships_from,ships_to,title,vendor
0,2cdcc469ef79378b8daa2accf657dbfd,,10/26/2014 19:00,Silk Road 2,4.3849,1530.2,,,EU Passport 1:1 Good quality,threekings
1,67b9e2d7823f022055171e68f9dc7c81,,8/26/2014 19:00,Silk Road 2,0.015625,7.9755,,,The Walking Dead Seasons 1-4 (AVI),wakeside917
2,9f3552e9f27fda3b83ac8d74e3821829,Alcohol,1/15/2014 18:00,Silk Road 2,0.65731,556.98,PHISHING WARNING:,Confirm that your browser is pointed to our re...,Custom order for foryoumyfr13nd,Alexmack380
3,e2e83eb0a80e26976bfeb700cba54bce,,9/14/2014 19:00,Silk Road 2,4.7957,2260.5,,,250tabs x 250ug LSD ---FREE POSTAGE---,top_gear_uk
4,d394565b83e410c446d7aa847e53c5df,DMT,10/3/2014 19:00,Silk Road 2,0.0231,7.5188,United Kingdom,Worldwide,Breakthrough DMT sample 50mg,9
5,3c1bd1c9b2314d0c21e1178e11955173,,10/10/2014 19:00,Silk Road 2,0.16441,59.055,,,5 Gr of Speed Paste 72%,kriminale
6,c4d4396922c138a512a441c11f73c067,,2/10/2014 18:00,Silk Road 2,0.008896,5.9553,United States,Worldwide,backroomcastingcouch.com Siterip (27GB),wakeside917
7,b99a3fbcec250857b99a79f7caefc1fc,,10/14/2014 19:00,Silk Road 2,0.53248,208.65,,,VIAGRA 10 PACKS OF 4,harrywinston
8,dc9a7e84d7ef868a48a81426e9cfb022,,3/9/2014 19:00,Silk Road 2,0.34441,215.54,Netherlands,Worldwide,50x Green Herbal life 90-120MG MDMA,salt-pepper
9,e196c6fe6fed3b2a750381db36e8fb1d,,4/27/2014 19:00,Silk Road 2,0.94595,413.44,Undeclared,Worldwide,Methenolone Enanthate - 10g - Primobolan,steroid-depot


In [5]:
weights, counts, subcategories, categories = [], [], [], []

#For each product in the raw data (each row is materialised once per
#iteration instead of once per extracted feature)
for _, row in new.iterrows():
    title = str(row["title"])
    weights.append(find_weight(title))          #Search for weight
    counts.append(find_count(title))            #Search for count
    subcategory, category = find_category(row)  #Search for SubCategory and Category
    subcategories.append(subcategory)
    categories.append(category)

#Attach the results as new columns, aligned row-for-row with the raw data
new["Weight"] = pd.Series(weights, index=new.index)         #Results of find_weight()
new["Count"] = pd.Series(counts, index=new.index)           #Results of find_count()
new["SubCat2"] = pd.Series(subcategories, index=new.index)  #Subcategory from find_category()
new["Cat2"] = pd.Series(categories, index=new.index)        #Category from find_category()

In [3]:
#Show each title next to the extracted fields (Weight, Count, Cat2, SubCat2)
#and the originally scraped category.
new[["title","Weight","Count","category","Cat2","SubCat2"]]

In [6]:
#Export new data to csv file (separator passed by keyword; newer pandas
#releases deprecate positional arguments after the path)
new.to_csv("cleaned_ouput.csv", sep=",", index=False)

### Working Portion of Code (scratch cells used to debug the extraction functions)

In [70]:
#Scratch cell: trace the keyword search used by find_count() on one sample
#title, printing every candidate number instead of returning the first one.
x = "Xanax/ Alprax 1mg - 500Pills"

words = ["x", "tabs", "tablets", "capsules", "pills"]

#Cleaning (this experiment also strips commas and square brackets)
s = x.lower()
s = re.sub("[(),]", "", s)
s = s.replace("[", "").replace("]", "")
s = s.split()

for word in words:
    for i in range(0, len(s)):
        if s[i] == word:
            #Stand-alone keyword: show the preceding token, else the following one
            try:
                print(s[i-1])
                print(float(s[i-1].split("-", 1)[0]))
            except ValueError:
                try:
                    print(s[i+1])
                    print(float(s[i+1].split("-", 1)[0]))
                except (ValueError, IndexError):
                    continue
        elif word in s[i]:
            #Keyword glued to the number: strip it and show the remainder
            try:
                print(float(s[i].replace(word, "").split("-", 1)[0]))
            except ValueError:
                continue

#Fallback: show every all-digit token that is not the weight
for i in range(0, len(s)):
    if s[i].isdigit() and float(s[i]) != find_weight(x):
        print(s[i])


500.0


In [80]:
#Scratch check: run find_weight() on a title decorated with single asterisks.
x="1 gr *High Quality Fishscale Cocaine* Original Product (85% - 92 % - Purity) *FREE SHIPPING* |"
#Replace every asterisk with a space before extraction (find_weight itself
#only replaces the triple "***").
x=x.replace("*"," ")
find_weight(x)

1.0

In [82]:
#Display the first 5000 rows of the enriched data, including the new
#Weight, Count, SubCat2 and Cat2 columns.
new[:5000]

Unnamed: 0,title_date,category,date,market,price,price_dollar,ships_from,ships_to,title,vendor,Weight,Count,SubCat2,Cat2
0,2cdcc469ef79378b8daa2accf657dbfd,,10/26/2014 19:00,Silk Road 2,4.3849,1530.2,,,EU Passport 1:1 Good quality,threekings,,,,
1,67b9e2d7823f022055171e68f9dc7c81,,8/26/2014 19:00,Silk Road 2,0.015625,7.9755,,,The Walking Dead Seasons 1-4 (AVI),wakeside917,,,,
2,9f3552e9f27fda3b83ac8d74e3821829,Alcohol,1/15/2014 18:00,Silk Road 2,0.65731,556.98,PHISHING WARNING:,Confirm that your browser is pointed to our re...,Custom order for foryoumyfr13nd,Alexmack380,,,Alcohol,Other
3,e2e83eb0a80e26976bfeb700cba54bce,,9/14/2014 19:00,Silk Road 2,4.7957,2260.5,,,250tabs x 250ug LSD ---FREE POSTAGE---,top_gear_uk,0.00025,250,LSD,Psychadelics
4,d394565b83e410c446d7aa847e53c5df,DMT,10/3/2014 19:00,Silk Road 2,0.0231,7.5188,United Kingdom,Worldwide,Breakthrough DMT sample 50mg,9,0.05000,,DMT,Psychadelics
5,3c1bd1c9b2314d0c21e1178e11955173,,10/10/2014 19:00,Silk Road 2,0.16441,59.055,,,5 Gr of Speed Paste 72%,kriminale,5.00000,,Speed,Stimulants
6,c4d4396922c138a512a441c11f73c067,,2/10/2014 18:00,Silk Road 2,0.008896,5.9553,United States,Worldwide,backroomcastingcouch.com Siterip (27GB),wakeside917,,,,
7,b99a3fbcec250857b99a79f7caefc1fc,,10/14/2014 19:00,Silk Road 2,0.53248,208.65,,,VIAGRA 10 PACKS OF 4,harrywinston,,10,,
8,dc9a7e84d7ef868a48a81426e9cfb022,,3/9/2014 19:00,Silk Road 2,0.34441,215.54,Netherlands,Worldwide,50x Green Herbal life 90-120MG MDMA,salt-pepper,0.09000,50,MDMA,Ecstasy
9,e196c6fe6fed3b2a750381db36e8fb1d,,4/27/2014 19:00,Silk Road 2,0.94595,413.44,Undeclared,Worldwide,Methenolone Enanthate - 10g - Primobolan,steroid-depot,10.00000,,,
