In [1]:
import os
import pandas as pd
import numpy as np
import re
import spacy


In [2]:
# Load the flower catalogue; the path is relative to the current working directory.
dataset = pd.read_csv("FlowerDatabase.csv")

Looking at the head of the Dataset

In [3]:
# Preview the first rows (head() defaults to five) to check the columns loaded as expected.
dataset.head()

Unnamed: 0,Name,Desc,PlantType,Color,HardinessZones,BloomsIn,Height,SoilNeeds,SunNeeds,WaterNeeds,Maintenance,RelatedFlowers
0,Achillea (Yarrow),"<p>Achillea, commonly known as Yarrow, is a ge...",Perennials,Yarrow comes in a wide range of colors includi...,3-9,"Spring, Summer",This plant usually grows between 30 and 90 cm ...,"Achillea will grow best in well drained soil, ...",Full sun,low,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da..."
1,Aconitum (Aconite; Monkshood; Wolfsbane),"<p>Aconitum, also known as Aconite, is a genus...",Perennials,"The flowers come in shades of blue, purple, wh...",3-8,"Autumn, Summer",Aconite is a tall plant that grows from 30 cm ...,"Aconite will perform best in rich, moist and w...","Full sun, Partial sun",avarage,medium,"['ageratum', 'asters', 'bellis', 'cosmos', 'da..."
2,Agapanthus (African Lily; Lily of the Nile),<p>Agapanthus is a genus composed of <strong>a...,"Bulbs, Perennials","These flowers are usually blue, white and purple.",6-11,"Autumn, Summer",Agapanthus range in height from just 20 cm (8 ...,"Agapanthus grows best in fertile, moist and we...","Full sun, Partial sun",avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da..."
3,Ageratum (Whiteweed; Flossflower),<p>Ageratum is a genus of <strong>about</stron...,Annuals,"The flowers come in shades of blue, purple, re...",2-11,"Autumn, Summer",These plants can grow up to 60 cm (2 ft) in he...,"Ageratum prefers moist, but well drained soil,...","Full sun, Partial sun",avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da..."
4,Ajuga (Bugle; Bugleweed),<p>Ajuga is a genus of <strong>about 70 specie...,"Annuals, Perennials","The flowers are usually blue, purple and white.",3-10,"Spring, Summer",These plants can grow up to 30 cm (1 ft) tall.,The plant grows best in moist and well drained...,"Partial sun, Shade",avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da..."


Looking at the Information in the Dataset

In [4]:
# Summarise column names, non-null counts and dtypes to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            223 non-null    object
 1   Desc            223 non-null    object
 2   PlantType       222 non-null    object
 3   Color           223 non-null    object
 4   HardinessZones  223 non-null    object
 5   BloomsIn        223 non-null    object
 6   Height          223 non-null    object
 7   SoilNeeds       223 non-null    object
 8   SunNeeds        223 non-null    object
 9   WaterNeeds      223 non-null    object
 10  Maintenance     223 non-null    object
 11  RelatedFlowers  223 non-null    object
dtypes: object(12)
memory usage: 21.0+ KB


Creating Functions to Clean the Dataset

In [5]:
# Removing the webscraped tags using regex

# Compiled once at definition time. Written as a raw string so that `\/` and `\n`
# reach the regex engine untouched instead of being (mis)interpreted as Python
# string escapes (`\/` is an invalid escape sequence in a normal string literal).
_TAG_PATTERN = re.compile(r"<(\/)?(strong|p|br|\n)\/?>")


def RemoveTags(series):
    """Replace the web-scraped markup tags in a single text value with spaces.

    Matches the opening, closing and self-closing forms of ``strong``, ``p`` and
    ``br`` (plus a newline enclosed in angle brackets). Matching is case-sensitive
    and tags carrying attributes are not matched. Every match is replaced by one
    space, so runs of adjacent tags leave runs of spaces behind.

    Parameters
    ----------
    series : str
        One cell of text (despite the name, this is a single value, not a
        pandas Series); intended for use with ``Series.apply``.

    Returns
    -------
    str
        The text with each matched tag replaced by a single space. Values that
        are not strings (e.g. NaN for a missing cell, or None) are returned
        unchanged instead of raising ``TypeError``.
    """
    # Missing cells arrive as float NaN / None when applied over a column; pass them through.
    if not isinstance(series, str):
        return series
    return _TAG_PATTERN.sub(" ", series)


# To display full (untruncated) column text, uncomment the line below
#pd.set_option('display.max_colwidth', None)

Applying the regex-based cleaning function to every description to strip the HTML tags left behind by the web scraper

In [6]:
# Run the tag cleaner over every description, then preview the first ten cleaned values.
dataset['Desc'] = dataset['Desc'].apply(RemoveTags)
dataset['Desc'].head(10)

0     Achillea, commonly known as Yarrow, is a genu...
1     Aconitum, also known as Aconite, is a genus o...
2     Agapanthus is a genus composed of  about 10 s...
3     Ageratum is a genus of  about   50 species  o...
4     Ajuga is a genus of  about 70 species  in the...
5     Alcea, also known as Hollyhock, is a genus of...
6     Allium is a genus of  more than 800 species  ...
7     Alstroemeria, also known as Peruvian Lily, is...
8     Amaryllis is a genus of two species of flower...
9     Amsonia is a genus of  about 20 species  of p...
Name: Desc, dtype: object

Using natural language processing (spaCy) to find candidate color words in the text, then a regex to extract the colors for each flower

In [7]:
# Load spaCy's small English pipeline (spacy itself is imported in the first cell,
# so it is not imported again here).
nlp = spacy.load("en_core_web_sm")

# Filled by the regex pass further down in this cell: one entry per row of dataset['Color'].
colors_array = []

# Exploratory pass: run every colour description through the pipeline and keep each
# token tagged as an adjective (pos_ == 'ADJ'). These tokens are the candidates from
# which the colour vocabulary used by the regex below was picked.
doc_array = [
    token
    for description in dataset['Color']
    for token in nlp(description)
    if token.pos_ == 'ADJ'
]

#color_df=pd.DataFrame(doc_array,columns=['Colors'])            # Creating a DataFrame
#color_df=color_df.astype(str)                                  # Converting to a string
#color_df['Colors'].unique()                                    # Finding the unique values

# The colors that were returned by the adjective extraction and cleaned up
color_array=['white','yellow','purple','orange','pink','red','blue','black','green','violet','brown','gray','golden']

# Build the whole-word pattern once, directly from color_array, so the vocabulary
# and the regex cannot drift apart and nothing is rebuilt inside the loop.
color_pattern = re.compile(r'\b(' + '|'.join(re.escape(color) for color in color_array) + r')\b')

# One entry per row of dataset['Color']: the list of colour words found (in order of
# appearance, repeats kept), or the string "no data" when nothing matches.
# The text is lower-cased before matching so that capitalised colours (for example
# at the start of a sentence) are found too and are reported in the same lower-case
# form as every other match.
colors_array = [
    color_pattern.findall(description.lower()) or "no data"
    for description in dataset['Color']
]
    







Adding the extracted colors to the DataFrame as a new column

In [8]:
# Key each row position to a one-field record holding that row's extracted colours.
dict_list = {
    position: {"colors": found}
    for position, found in enumerate(colors_array)
}

# One row per key, with a single 'colors' column.
df = pd.DataFrame.from_dict(dict_list, orient='index')

# Attach the single-column frame to the main dataset as 'Colors List'.
dataset['Colors List'] = df

Creating new dataframe

In [9]:
# Build a copy without the free-text 'Color' column (the parsed values live in
# 'Colors List'); `dataset` itself is not modified by drop().
new_dataset = dataset.drop(columns=['Color'])
new_dataset
   

Unnamed: 0,Name,Desc,PlantType,HardinessZones,BloomsIn,Height,SoilNeeds,SunNeeds,WaterNeeds,Maintenance,RelatedFlowers,Colors List
0,Achillea (Yarrow),"Achillea, commonly known as Yarrow, is a genu...",Perennials,3-9,"Spring, Summer",This plant usually grows between 30 and 90 cm ...,"Achillea will grow best in well drained soil, ...",Full sun,low,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[white, yellow, purple, orange, pink, red]"
1,Aconitum (Aconite; Monkshood; Wolfsbane),"Aconitum, also known as Aconite, is a genus o...",Perennials,3-8,"Autumn, Summer",Aconite is a tall plant that grows from 30 cm ...,"Aconite will perform best in rich, moist and w...","Full sun, Partial sun",avarage,medium,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[blue, purple, white, yellow, pink]"
2,Agapanthus (African Lily; Lily of the Nile),Agapanthus is a genus composed of about 10 s...,"Bulbs, Perennials",6-11,"Autumn, Summer",Agapanthus range in height from just 20 cm (8 ...,"Agapanthus grows best in fertile, moist and we...","Full sun, Partial sun",avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[blue, white, purple]"
3,Ageratum (Whiteweed; Flossflower),Ageratum is a genus of about 50 species o...,Annuals,2-11,"Autumn, Summer",These plants can grow up to 60 cm (2 ft) in he...,"Ageratum prefers moist, but well drained soil,...","Full sun, Partial sun",avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[blue, purple, red, pink, white]"
4,Ajuga (Bugle; Bugleweed),Ajuga is a genus of about 70 species in the...,"Annuals, Perennials",3-10,"Spring, Summer",These plants can grow up to 30 cm (1 ft) tall.,The plant grows best in moist and well drained...,"Partial sun, Shade",avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[blue, purple, white]"
...,...,...,...,...,...,...,...,...,...,...,...,...
218,Yucca (Yucca),Yucca is a genus of about 50 species of eve...,"Cactus - Succulents, Shrubs, Trees",4-11,"Spring, Autumn, Summer","Yucca grows from 60 cm to 4,5 metres (2 to 15 ...",This plant grows best in sandy and well draine...,"Full sun, Partial sun",low,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[green, white]"
219,Zantedeschia (Calla Lily; Arum Lily),Zantedeschia is a genus in the family Aracea...,"Bulbs, Perennials",3-10,"Spring, Summer",The plant grows between 30 and 90 cm (1 to 3 f...,Zantedeschia grows best in moist and well-drai...,"Full sun, Partial sun",avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[white, yellow, orange, pink, purple, black]"
220,Zephyranthes (Rain Lily; Fairy Lily),Zephyranthes is a genus of about 90 species ...,"Annuals, Bulbs, Perennials",7-11,"Spring, Autumn, Summer",The plant can grow up to 30 cm (1 ft) in height.,Zephyranthes prefers well-drained soil with a ...,"Full sun, Partial sun",avarage,medium,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[yellow, pink, white]"
221,Zinnia (Zinnia),"Zinnia is a genus of the family Asteraceae ,...","Annuals, Perennials, Shrubs",2-11,"Autumn, Summer",Zinnias range in size from dwarf species that ...,"Zinnias prefer fertile and moist, but well-dra...",Full sun,avarage,low,"['ageratum', 'asters', 'bellis', 'cosmos', 'da...","[yellow, orange, white, red, pink, purple]"
